diff --git a/src/main/scala/io/archivesunleashed/app/AudioInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/AudioInformationExtractor.scala new file mode 100644 index 00000000..d0c19926 --- /dev/null +++ b/src/main/scala/io/archivesunleashed/app/AudioInformationExtractor.scala @@ -0,0 +1,43 @@ +/* + * Copyright © 2017 The Archives Unleashed Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.archivesunleashed.app + +import io.archivesunleashed.ArchiveRecord +import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} + +object AudioInformationExtractor { + /** Extract information about audio files from web archive using + * DataFrame and Spark SQL. 
+ * + * @param d DataFrame obtained from RecordLoader + * @return Dataset[Row], where the schema is (url, filename, extension, + * mime_type_web_server, mime_type_tika, md5, sha1) + */ + def apply(d: DataFrame): Dataset[Row] = { + val spark = SparkSession.builder().master("local").getOrCreate() + // scalastyle:off + import spark.implicits._ + // scalastyle:on + d.select($"url", + $"filename", + $"extension", + $"mime_type_web_server", + $"mime_type_tika", + $"md5", + $"sha1") + } +} diff --git a/src/main/scala/io/archivesunleashed/app/CommandLineApp.scala b/src/main/scala/io/archivesunleashed/app/CommandLineApp.scala index 2838b96d..18f96866 100644 --- a/src/main/scala/io/archivesunleashed/app/CommandLineApp.scala +++ b/src/main/scala/io/archivesunleashed/app/CommandLineApp.scala @@ -34,26 +34,32 @@ import org.rogach.scallop.ScallopConf * --input INPUT_FILE ... * --output OUTPUT_DIRECTORY * [--output-format FORMAT] - * [--df] * [--split] * [--partiton] * * where EXTRACTOR is one of - * DomainFrequencyExtractor, DomainGraphExtractor or PlainTextExtractor + * AudioInformationExtractor, DomainFrequencyExtractor, DomainGraphExtractor, + * ImageGraphExtractor, ImageInformationExtractor, PDFInformationExtractor, + * PlainTextExtractor, PresentationProgramInformationExtractor, + * SpreadsheetInformationExtractor, TextFilesInformationExtractor, + * VideoInformationExtractor, WebGraphExtractor, WebPagesExtractor, + * or WordProcessorInformationExtractor. * * INPUT_FILE is a list of input files separated by space (or path containing wildcard) * OUTPUT_DIRECTORY is the directory to put result in * * FORMAT is meant to work with DomainGraphExtractor - * Three supported options are CSV (default), GEXF, or GRAPHML + * Three supported options are csv (default), gexf, or graphml; + * gexf and graphml apply only to DomainGraphExtractor. * - * If --split is present, the program will put results for each input file in its own folder. Otherwise they will be merged.
+ * If --split is present, the program will put results for each input file in its own folder. + * Otherwise they will be merged. * - * If --partition N is present, the program will partition the DataFrame according to N before writing results. - * Otherwise, the partition is left as is. + * If --partition N is present, the program will partition the DataFrame according + * to N before writing results. Otherwise, the partition is left as is. */ -/** Construct a Scallop option reader from command line argument string list +/** Construct a Scallop option reader from command line argument string list. * * @param args list of command line arguments passed as is from argv */ @@ -79,7 +85,7 @@ class CmdAppConf(args: Seq[String]) extends ScallopConf(args) { val input = opt[List[String]](descr = "input file path", required = true) val output = opt[String](descr = "output directory path", required = true) val outputFormat = opt[String](descr = - "output format for DomainGraphExtractor, one of CSV, GEXF, or GRAPHML") + "output format for DomainGraphExtractor, one of csv, gexf, or graphml") val split = opt[Boolean]() val partition = opt[Int]() verify() @@ -105,6 +111,14 @@ class CommandLineApp(conf: CmdAppConf) { */ private val extractors = Map[String, List[String] => Any]( + "AudioInformationExtractor" -> + ((inputFiles: List[String]) => { + var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).audio() + inputFiles.tail foreach { f => + df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).audio()) + } + save(AudioInformationExtractor(df)) + }), "DomainFrequencyExtractor" -> ((inputFiles: List[String]) => { var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).webpages() @@ -119,16 +133,24 @@ class CommandLineApp(conf: CmdAppConf) { inputFiles.tail foreach { f => df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).webgraph()) } - if (!configuration.outputFormat.isEmpty && configuration.outputFormat() == "GEXF") { + if 
(!configuration.outputFormat.isEmpty && configuration.outputFormat() == "gexf") { new File(saveTarget).mkdirs() WriteGEXF(DomainGraphExtractor(df).collect(), Paths.get(saveTarget).toString + "/GEXF.gexf") - } else if (!configuration.outputFormat.isEmpty && configuration.outputFormat() == "GRAPHML") { + } else if (!configuration.outputFormat.isEmpty && configuration.outputFormat() == "graphml") { new File(saveTarget).mkdirs() WriteGraphML(DomainGraphExtractor(df).collect(), Paths.get(saveTarget).toString + "/GRAPHML.graphml") } else { save(DomainGraphExtractor(df)) } }), + "ImageInformationExtractor" -> + ((inputFiles: List[String]) => { + var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).images() + inputFiles.tail foreach { f => + df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).images()) + } + save(ImageInformationExtractor(df)) + }), "ImageGraphExtractor" -> ((inputFiles: List[String]) => { var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).imagegraph() @@ -137,6 +159,14 @@ class CommandLineApp(conf: CmdAppConf) { } save(ImageGraphExtractor(df)) }), + "PDFInformationExtractor" -> + ((inputFiles: List[String]) => { + var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).pdfs() + inputFiles.tail foreach { f => + df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).pdfs()) + } + save(PDFInformationExtractor(df)) + }), "PlainTextExtractor" -> ((inputFiles: List[String]) => { var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).webpages() @@ -145,6 +175,46 @@ class CommandLineApp(conf: CmdAppConf) { } save(PlainTextExtractor(df)) }), + "PresentationProgramInformationExtractor" -> + ((inputFiles: List[String]) => { + var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).presentationProgramFiles() + inputFiles.tail foreach { f => + df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).presentationProgramFiles()) + } + save(PresentationProgramInformationExtractor(df)) + }), + 
"SpreadsheetInformationExtractor" -> + ((inputFiles: List[String]) => { + var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).spreadsheets() + inputFiles.tail foreach { f => + df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).spreadsheets()) + } + save(SpreadsheetInformationExtractor(df)) + }), + "TextFilesInformationExtractor" -> + ((inputFiles: List[String]) => { + var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).textFiles() + inputFiles.tail foreach { f => + df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).textFiles()) + } + save(TextFilesInformationExtractor(df)) + }), + "VideoInformationExtractor" -> + ((inputFiles: List[String]) => { + var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).videos() + inputFiles.tail foreach { f => + df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).videos()) + } + save(VideoInformationExtractor(df)) + }), + "WebGraphExtractor" -> + ((inputFiles: List[String]) => { + var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).webgraph() + inputFiles.tail foreach { f => + df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).webgraph()) + } + save(WebGraphExtractor(df)) + }), "WebPagesExtractor" -> ((inputFiles: List[String]) => { var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).webpages() @@ -152,6 +222,14 @@ class CommandLineApp(conf: CmdAppConf) { df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).webpages()) } save(WebPagesExtractor(df)) + }), + "WordProcessorInformationExtractor" -> + ((inputFiles: List[String]) => { + var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).wordProcessorFiles() + inputFiles.tail foreach { f => + df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).wordProcessorFiles()) + } + save(WordProcessorInformationExtractor(df)) }) ) diff --git a/src/main/scala/io/archivesunleashed/app/DomainFrequencyExtractor.scala b/src/main/scala/io/archivesunleashed/app/DomainFrequencyExtractor.scala 
index 9e87a488..87b28ca0 100644 --- a/src/main/scala/io/archivesunleashed/app/DomainFrequencyExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/DomainFrequencyExtractor.scala @@ -32,7 +32,6 @@ object DomainFrequencyExtractor { // scalastyle:off import spark.implicits._ // scalastyle:on - d.groupBy(ExtractDomainDF($"url").as("domain")) .count() .sort($"count".desc) diff --git a/src/main/scala/io/archivesunleashed/app/ImageGraphExtractor.scala b/src/main/scala/io/archivesunleashed/app/ImageGraphExtractor.scala index 62da97f0..d5647eaf 100644 --- a/src/main/scala/io/archivesunleashed/app/ImageGraphExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/ImageGraphExtractor.scala @@ -17,7 +17,6 @@ package io.archivesunleashed.app import io.archivesunleashed.{ArchiveRecord, DataFrameLoader} -import org.apache.spark.sql.functions.desc import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} object ImageGraphExtractor { diff --git a/src/main/scala/io/archivesunleashed/app/ImageInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/ImageInformationExtractor.scala new file mode 100644 index 00000000..20bbfb2c --- /dev/null +++ b/src/main/scala/io/archivesunleashed/app/ImageInformationExtractor.scala @@ -0,0 +1,45 @@ +/* + * Copyright © 2017 The Archives Unleashed Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.archivesunleashed.app + +import io.archivesunleashed.ArchiveRecord +import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} + +object ImageInformationExtractor { + /** Extract information about images from web archive using DataFrame + * and Spark SQL. + * + * @param d DataFrame obtained from RecordLoader + * @return Dataset[Row], where the schema is (crawl date, url, + * mime_type_web_server, mime_type_tika, width, height, language, content) + */ + def apply(d: DataFrame): Dataset[Row] = { + val spark = SparkSession.builder().master("local").getOrCreate() + // scalastyle:off + import spark.implicits._ + // scalastyle:on + d.select($"url", + $"filename", + $"extension", + $"mime_type_web_server", + $"mime_type_tika", + $"width", + $"height", + $"md5", + $"sha1") + } +} diff --git a/src/main/scala/io/archivesunleashed/app/PDFInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/PDFInformationExtractor.scala new file mode 100644 index 00000000..46e61bda --- /dev/null +++ b/src/main/scala/io/archivesunleashed/app/PDFInformationExtractor.scala @@ -0,0 +1,43 @@ +/* + * Copyright © 2017 The Archives Unleashed Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.archivesunleashed.app + +import io.archivesunleashed.ArchiveRecord +import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} + +object PDFInformationExtractor { + /** Extract information about PDFs from web archive using DataFrame + * and Spark SQL. + * + * @param d DataFrame obtained from RecordLoader + * @return Dataset[Row], where the schema is (crawl date, url, + * mime_type_web_server, mime_type_tika, language, content) + */ + def apply(d: DataFrame): Dataset[Row] = { + val spark = SparkSession.builder().master("local").getOrCreate() + // scalastyle:off + import spark.implicits._ + // scalastyle:on + d.select($"url", + $"filename", + $"extension", + $"mime_type_web_server", + $"mime_type_tika", + $"md5", + $"sha1") + } +} diff --git a/src/main/scala/io/archivesunleashed/app/PresentationProgramInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/PresentationProgramInformationExtractor.scala new file mode 100644 index 00000000..7aa9d02f --- /dev/null +++ b/src/main/scala/io/archivesunleashed/app/PresentationProgramInformationExtractor.scala @@ -0,0 +1,43 @@ +/* + * Copyright © 2017 The Archives Unleashed Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.archivesunleashed.app + +import io.archivesunleashed.ArchiveRecord +import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} + +object PresentationProgramInformationExtractor { + /** Extract information about presentation program files + * from web archive using DataFrame and Spark SQL. + * + * @param d DataFrame obtained from RecordLoader + * @return Dataset[Row], where the schema is (crawl date, url, + * mime_type_web_server, mime_type_tika, language, content) + */ + def apply(d: DataFrame): Dataset[Row] = { + val spark = SparkSession.builder().master("local").getOrCreate() + // scalastyle:off + import spark.implicits._ + // scalastyle:on + d.select($"url", + $"filename", + $"extension", + $"mime_type_web_server", + $"mime_type_tika", + $"md5", + $"sha1") + } +} diff --git a/src/main/scala/io/archivesunleashed/app/SpreadsheetInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/SpreadsheetInformationExtractor.scala new file mode 100644 index 00000000..ec153068 --- /dev/null +++ b/src/main/scala/io/archivesunleashed/app/SpreadsheetInformationExtractor.scala @@ -0,0 +1,43 @@ +/* + * Copyright © 2017 The Archives Unleashed Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.archivesunleashed.app + +import io.archivesunleashed.ArchiveRecord +import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} + +object SpreadsheetInformationExtractor { + /** Extract information about spreadsheets from web archive using + * DataFrame and Spark SQL. + * + * @param d DataFrame obtained from RecordLoader + * @return Dataset[Row], where the schema is (crawl date, url, + * mime_type_web_server, mime_type_tika, language, content) + */ + def apply(d: DataFrame): Dataset[Row] = { + val spark = SparkSession.builder().master("local").getOrCreate() + // scalastyle:off + import spark.implicits._ + // scalastyle:on + d.select($"url", + $"filename", + $"extension", + $"mime_type_web_server", + $"mime_type_tika", + $"md5", + $"sha1") + } +} diff --git a/src/main/scala/io/archivesunleashed/app/TextFilesInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/TextFilesInformationExtractor.scala new file mode 100644 index 00000000..8a9e626a --- /dev/null +++ b/src/main/scala/io/archivesunleashed/app/TextFilesInformationExtractor.scala @@ -0,0 +1,43 @@ +/* + * Copyright © 2017 The Archives Unleashed Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.archivesunleashed.app + +import io.archivesunleashed.ArchiveRecord +import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} + +object TextFilesInformationExtractor { + /** Extract information about text files from web archive using + * DataFrame and Spark SQL. + * + * @param d DataFrame obtained from RecordLoader + * @return Dataset[Row], where the schema is (crawl date, url, + * mime_type_web_server, mime_type_tika, language, content) + */ + def apply(d: DataFrame): Dataset[Row] = { + val spark = SparkSession.builder().master("local").getOrCreate() + // scalastyle:off + import spark.implicits._ + // scalastyle:on + d.select($"url", + $"filename", + $"extension", + $"mime_type_web_server", + $"mime_type_tika", + $"md5", + $"sha1") + } +} diff --git a/src/main/scala/io/archivesunleashed/app/VideoInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/VideoInformationExtractor.scala new file mode 100644 index 00000000..654a3427 --- /dev/null +++ b/src/main/scala/io/archivesunleashed/app/VideoInformationExtractor.scala @@ -0,0 +1,43 @@ +/* + * Copyright © 2017 The Archives Unleashed Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.archivesunleashed.app + +import io.archivesunleashed.ArchiveRecord +import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} + +object VideoInformationExtractor { + /** Extract information about videos from web archive using DataFrame + * and Spark SQL. 
+ * + * @param d DataFrame obtained from RecordLoader + * @return Dataset[Row], where the schema is (url, filename, extension, + * mime_type_web_server, mime_type_tika, md5, sha1) + */ + def apply(d: DataFrame): Dataset[Row] = { + val spark = SparkSession.builder().master("local").getOrCreate() + // scalastyle:off + import spark.implicits._ + // scalastyle:on + d.select($"url", + $"filename", + $"extension", + $"mime_type_web_server", + $"mime_type_tika", + $"md5", + $"sha1") + } +} diff --git a/src/main/scala/io/archivesunleashed/app/WebGraphExtractor.scala b/src/main/scala/io/archivesunleashed/app/WebGraphExtractor.scala new file mode 100644 index 00000000..f8782615 --- /dev/null +++ b/src/main/scala/io/archivesunleashed/app/WebGraphExtractor.scala @@ -0,0 +1,34 @@ +/* + * Copyright © 2017 The Archives Unleashed Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.archivesunleashed.app + +import io.archivesunleashed.{ArchiveRecord, DataFrameLoader} +import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} + +object WebGraphExtractor { + + /** Extract web graph from web archive using DataFrame and Spark SQL.
+ * + * @param d DataFrame obtained from RecordLoader + * @return Dataset[Row], where the schema is (crawl date, src, dest, + * anchor text) + */ + def apply(d: DataFrame): Dataset[Row] = { + val spark = SparkSession.builder().master("local").getOrCreate() + d + } +} diff --git a/src/main/scala/io/archivesunleashed/app/WebPagesExtractor.scala b/src/main/scala/io/archivesunleashed/app/WebPagesExtractor.scala index 71219151..aaa1e8eb 100644 --- a/src/main/scala/io/archivesunleashed/app/WebPagesExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/WebPagesExtractor.scala @@ -18,7 +18,7 @@ package io.archivesunleashed.app import io.archivesunleashed.ArchiveRecord import io.archivesunleashed.df.{ExtractDomainDF, RemoveHTMLDF, - RemoveHTTPHeaderDF} + RemoveHTTPHeaderDF, RemovePrefixWWWDF} import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} object WebPagesExtractor { @@ -30,6 +30,15 @@ object WebPagesExtractor { */ def apply(d: DataFrame): Dataset[Row] = { val spark = SparkSession.builder().master("local").getOrCreate() - d + // scalastyle:off + import spark.implicits._ + // scalastyle:on + d.select($"crawl_date", + RemovePrefixWWWDF(ExtractDomainDF($"url")).as("domain"), + $"url", + $"mime_type_web_server", + $"mime_type_tika", + $"language", + RemoveHTMLDF(RemoveHTTPHeaderDF(($"content"))).alias("content")) } } diff --git a/src/main/scala/io/archivesunleashed/app/WordProcessorInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/WordProcessorInformationExtractor.scala new file mode 100644 index 00000000..3ebc8bb1 --- /dev/null +++ b/src/main/scala/io/archivesunleashed/app/WordProcessorInformationExtractor.scala @@ -0,0 +1,43 @@ +/* + * Copyright © 2017 The Archives Unleashed Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.archivesunleashed.app + +import io.archivesunleashed.ArchiveRecord +import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} + +object WordProcessorInformationExtractor { + /** Extract information about word processor files from web archive + * using DataFrame and Spark SQL. + * + * @param d DataFrame obtained from RecordLoader + * @return Dataset[Row], where the schema is (crawl date, url, + * mime_type_web_server, mime_type_tika, language, content) + */ + def apply(d: DataFrame): Dataset[Row] = { + val spark = SparkSession.builder().master("local").getOrCreate() + // scalastyle:off + import spark.implicits._ + // scalastyle:on + d.select($"url", + $"filename", + $"extension", + $"mime_type_web_server", + $"mime_type_tika", + $"md5", + $"sha1") + } +} diff --git a/src/test/scala/io/archivesunleashed/app/AudioInformationExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/AudioInformationExtractorTest.scala new file mode 100644 index 00000000..ad8730af --- /dev/null +++ b/src/test/scala/io/archivesunleashed/app/AudioInformationExtractorTest.scala @@ -0,0 +1,61 @@ +/* + * Copyright © 2017 The Archives Unleashed Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.archivesunleashed.app + +import com.google.common.io.Resources +import io.archivesunleashed.RecordLoader +import org.apache.spark.{SparkConf, SparkContext} +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner +import org.scalatest.{BeforeAndAfter, FunSuite} + +@RunWith(classOf[JUnitRunner]) +class AudioInformationExtractorTest extends FunSuite with BeforeAndAfter { + private val arcPath = Resources.getResource("warc/example.media.warc.gz").getPath + private var sc: SparkContext = _ + private val master = "local[4]" + private val appName = "example-spark" + + before { + val conf = new SparkConf() + .setMaster(master) + .setAppName(appName) + conf.set("spark.driver.allowMultipleContexts", "true") + sc = new SparkContext(conf) + } + + test("Audio information extractor DF") { + val df = RecordLoader.loadArchives(arcPath, sc).audio() + val dfResults = AudioInformationExtractor(df).collect() + val RESULTSLENGTH = 1 + + assert(dfResults.length == RESULTSLENGTH) + assert(dfResults(0).get(0) == "https://ruebot.net/files/feniz.mp3") + assert(dfResults(0).get(1) == "feniz.mp3") + assert(dfResults(0).get(2) == "mp3") + assert(dfResults(0).get(3) == "audio/mpeg") + assert(dfResults(0).get(4) == "audio/mpeg") + assert(dfResults(0).get(5) == "f7e7ec84b12c294e19af1ba41732c733") + assert(dfResults(0).get(6) == "a3eb95dbbea76460529d0d9ebdde5faabaff544a") + } + + after { + if (sc != null) { + sc.stop() + } + } +} diff --git a/src/test/scala/io/archivesunleashed/app/CommandLineAppTest.scala 
b/src/test/scala/io/archivesunleashed/app/CommandLineAppTest.scala index ff96a583..9ebd02d9 100644 --- a/src/test/scala/io/archivesunleashed/app/CommandLineAppTest.scala +++ b/src/test/scala/io/archivesunleashed/app/CommandLineAppTest.scala @@ -54,7 +54,25 @@ class CommandLineAppTest extends FunSuite with BeforeAndAfter { Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, imageGraphOpt), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, imageGraphOpt, "--partition", "1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, webPagesOpt), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, webPagesOpt, "--partition", "1") + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, webPagesOpt, "--partition", "1"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "AudioInformationExtractor"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "AudioInformationExtractor", "--partition", "1"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "ImageInformationExtractor"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "ImageInformationExtractor", "--partition", "1"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PDFInformationExtractor"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PDFInformationExtractor", "--partition", "1"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PresentationProgramInformationExtractor"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PresentationProgramInformationExtractor", "--partition", "1"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "SpreadsheetInformationExtractor"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "SpreadsheetInformationExtractor", "--partition", "1"), + Array(inputOpt, arcPath, warcPath, outputOpt, 
outputDir, extractOpt, "TextFilesInformationExtractor"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "TextFilesInformationExtractor", "--partition", "1"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "VideoInformationExtractor"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "VideoInformationExtractor", "--partition", "1"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WordProcessorInformationExtractor"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WordProcessorInformationExtractor", "--partition", "1"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WebGraphExtractor"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WebGraphExtractor", "--partition", "1") ) private val testFailCmds = Array( diff --git a/src/test/scala/io/archivesunleashed/app/ImageInformationExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/ImageInformationExtractorTest.scala new file mode 100644 index 00000000..829cbc5d --- /dev/null +++ b/src/test/scala/io/archivesunleashed/app/ImageInformationExtractorTest.scala @@ -0,0 +1,63 @@ +/* + * Copyright © 2017 The Archives Unleashed Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.archivesunleashed.app + +import com.google.common.io.Resources +import io.archivesunleashed.RecordLoader +import org.apache.spark.{SparkConf, SparkContext} +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner +import org.scalatest.{BeforeAndAfter, FunSuite} + +@RunWith(classOf[JUnitRunner]) +class ImageInformationExtractorTest extends FunSuite with BeforeAndAfter { + private val arcPath = Resources.getResource("warc/example.warc.gz").getPath + private var sc: SparkContext = _ + private val master = "local[4]" + private val appName = "example-spark" + + before { + val conf = new SparkConf() + .setMaster(master) + .setAppName(appName) + conf.set("spark.driver.allowMultipleContexts", "true") + sc = new SparkContext(conf) + } + + test("Image information extractor DF") { + val df = RecordLoader.loadArchives(arcPath, sc).images() + val dfResults = ImageInformationExtractor(df).collect() + val RESULTSLENGTH = 55 + + assert(dfResults.length == RESULTSLENGTH) + assert(dfResults(0).get(0) == "http://www.archive.org/images/logoc.jpg") + assert(dfResults(0).get(1) == "logoc.jpg") + assert(dfResults(0).get(2) == "jpg") + assert(dfResults(0).get(3) == "image/jpeg") + assert(dfResults(0).get(4) == "image/jpeg") + assert(dfResults(0).get(5) == 70) + assert(dfResults(0).get(6) == 56) + assert(dfResults(0).get(7) == "8211d1fbb9b03d8522a1ae378f9d1b24") + assert(dfResults(0).get(8) == "a671e68fc211ee4996a91e99297f246b2c5faa1a") + } + + after { + if (sc != null) { + sc.stop() + } + } +} diff --git a/src/test/scala/io/archivesunleashed/app/PDFInformationExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/PDFInformationExtractorTest.scala new file mode 100644 index 00000000..9ef66f7b --- /dev/null +++ b/src/test/scala/io/archivesunleashed/app/PDFInformationExtractorTest.scala @@ -0,0 +1,61 @@ +/* + * Copyright © 2017 The Archives Unleashed Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use 
/*
 * Copyright © 2017 The Archives Unleashed Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.archivesunleashed.app

import com.google.common.io.Resources
import io.archivesunleashed.RecordLoader
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{BeforeAndAfter, FunSuite}

/** Exercises [[PDFInformationExtractor]] against the bundled PDF example WARC. */
@RunWith(classOf[JUnitRunner])
class PDFInformationExtractorTest extends FunSuite with BeforeAndAfter {
  private val warcPath = Resources.getResource("warc/example.pdf.warc.gz").getPath
  private var sc: SparkContext = _

  before {
    // allowMultipleContexts avoids clashes with other suites in the same JVM.
    val conf = new SparkConf()
      .setMaster("local[4]")
      .setAppName("example-spark")
      .set("spark.driver.allowMultipleContexts", "true")
    sc = new SparkContext(conf)
  }

  test("PDF information extractor DF") {
    val pdfs = RecordLoader.loadArchives(warcPath, sc).pdfs()
    val results = PDFInformationExtractor(pdfs).collect()

    // The example WARC contains 2 PDF records.
    assert(results.length == 2)

    // Spot-check every column of the first row: url, filename, extension,
    // mime_type_web_server, mime_type_tika, md5, sha1.
    val first = results(0)
    assert(first.get(0) == "https://yorkspace.library.yorku.ca/xmlui/bitstream/handle/10315/36158/cost-analysis.pdf?sequence=1&isAllowed=y")
    assert(first.get(1) == "cost-analysis.pdf")
    assert(first.get(2) == "pdf")
    assert(first.get(3) == "application/pdf")
    assert(first.get(4) == "application/pdf")
    assert(first.get(5) == "aaba59d2287afd40c996488a39bbc0dd")
    assert(first.get(6) == "569c28e0e8faa6945d6ca88fcd9e195825052c71")
  }

  after {
    // Stop the context if the fixture managed to create one.
    Option(sc).foreach(_.stop())
  }
}
/*
 * Copyright © 2017 The Archives Unleashed Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.archivesunleashed.app

import com.google.common.io.Resources
import io.archivesunleashed.RecordLoader
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{BeforeAndAfter, FunSuite}

/** Exercises [[PresentationProgramInformationExtractor]] against the docs example WARC. */
@RunWith(classOf[JUnitRunner])
class PresentationProgramInformationExtractorTest extends FunSuite with BeforeAndAfter {
  private val warcPath = Resources.getResource("warc/example.docs.warc.gz").getPath
  private var sc: SparkContext = _

  before {
    // allowMultipleContexts avoids clashes with other suites in the same JVM.
    val conf = new SparkConf()
      .setMaster("local[4]")
      .setAppName("example-spark")
      .set("spark.driver.allowMultipleContexts", "true")
    sc = new SparkContext(conf)
  }

  test("Presentation program information extractor DF") {
    val presentations = RecordLoader.loadArchives(warcPath, sc).presentationProgramFiles()
    val results = PresentationProgramInformationExtractor(presentations).collect()

    // The example WARC contains 2 presentation-program records.
    assert(results.length == 2)

    // Spot-check every column of the first row: url, filename, extension,
    // mime_type_web_server, mime_type_tika, md5, sha1.
    val first = results(0)
    assert(first.get(0) == "https://ruebot.net/files/aut-test-fixtures/aut-test-fixtures.pptx")
    assert(first.get(1) == "aut-test-fixtures.pptx")
    assert(first.get(2) == "pptx")
    assert(first.get(3) == "application/vnd.openxmlformats-officedocument.presentationml.presentation")
    assert(first.get(4) == "application/vnd.openxmlformats-officedocument.presentationml.presentation")
    assert(first.get(5) == "7a7b1fe4b6d311376eaced9de3b682ee")
    assert(first.get(6) == "86fadca47b134b68247ccde62da4ce3f62b4d2ec")
  }

  after {
    // Stop the context if the fixture managed to create one.
    Option(sc).foreach(_.stop())
  }
}
/*
 * Copyright © 2017 The Archives Unleashed Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.archivesunleashed.app

import com.google.common.io.Resources
import io.archivesunleashed.RecordLoader
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{BeforeAndAfter, FunSuite}

/** Exercises [[SpreadsheetInformationExtractor]] against the docs example WARC. */
@RunWith(classOf[JUnitRunner])
class SpreadsheetInformationExtractorTest extends FunSuite with BeforeAndAfter {
  private val warcPath = Resources.getResource("warc/example.docs.warc.gz").getPath
  private var sc: SparkContext = _

  before {
    // allowMultipleContexts avoids clashes with other suites in the same JVM.
    val conf = new SparkConf()
      .setMaster("local[4]")
      .setAppName("example-spark")
      .set("spark.driver.allowMultipleContexts", "true")
    sc = new SparkContext(conf)
  }

  test("Spreadsheet information extractor DF") {
    val spreadsheets = RecordLoader.loadArchives(warcPath, sc).spreadsheets()
    val results = SpreadsheetInformationExtractor(spreadsheets).collect()

    // The example WARC contains 4 spreadsheet records.
    assert(results.length == 4)

    // Spot-check every column of the first row: url, filename, extension,
    // mime_type_web_server, mime_type_tika, md5, sha1.
    val first = results(0)
    assert(first.get(0) == "https://ruebot.net/files/aut-test-fixtures/test-aut-fixture.ods")
    assert(first.get(1) == "test-aut-fixture.ods")
    assert(first.get(2) == "ods")
    assert(first.get(3) == "application/vnd.oasis.opendocument.spreadsheet")
    assert(first.get(4) == "application/vnd.oasis.opendocument.spreadsheet")
    assert(first.get(5) == "7f70280757d8beb2d1bfd6fb1b6ae6e9")
    assert(first.get(6) == "448c357e78317877a98a399448031a89f1dda6fb")
  }

  after {
    // Stop the context if the fixture managed to create one.
    Option(sc).foreach(_.stop())
  }
}
/*
 * Copyright © 2017 The Archives Unleashed Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.archivesunleashed.app

import com.google.common.io.Resources
import io.archivesunleashed.RecordLoader
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{BeforeAndAfter, FunSuite}

/** Exercises [[TextFilesInformationExtractor]] against the text example WARC. */
@RunWith(classOf[JUnitRunner])
class TextFilesInformationExtractorTest extends FunSuite with BeforeAndAfter {
  private val warcPath = Resources.getResource("warc/example.txt.warc.gz").getPath
  private var sc: SparkContext = _

  before {
    // allowMultipleContexts avoids clashes with other suites in the same JVM.
    val conf = new SparkConf()
      .setMaster("local[4]")
      .setAppName("example-spark")
      .set("spark.driver.allowMultipleContexts", "true")
    sc = new SparkContext(conf)
  }

  test("Text files information extractor DF") {
    val textFiles = RecordLoader.loadArchives(warcPath, sc).textFiles()
    val results = TextFilesInformationExtractor(textFiles).collect()

    // The example WARC contains a single text-file record.
    assert(results.length == 1)

    // Spot-check every column of the first row: url, filename, extension,
    // mime_type_web_server, mime_type_tika, md5, sha1.
    val first = results(0)
    assert(first.get(0) == "https://ruebot.net/files/aut-test-fixtures/aut-text.txt")
    assert(first.get(1) == "aut-text.txt")
    assert(first.get(2) == "txt")
    assert(first.get(3) == "text/plain")
    assert(first.get(4) == "application/gzip")
    assert(first.get(5) == "32abd404fb560ecf14b75611f3cc5c2c")
    assert(first.get(6) == "9dc9d163d933085348e90cd2b6e523e3139d3e88")
  }

  after {
    // Stop the context if the fixture managed to create one.
    Option(sc).foreach(_.stop())
  }
}
/*
 * Copyright © 2017 The Archives Unleashed Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.archivesunleashed.app

import com.google.common.io.Resources
import io.archivesunleashed.RecordLoader
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{BeforeAndAfter, FunSuite}

/** Exercises [[VideoInformationExtractor]] against the media example WARC. */
@RunWith(classOf[JUnitRunner])
class VideoInformationExtractorTest extends FunSuite with BeforeAndAfter {
  private val warcPath = Resources.getResource("warc/example.media.warc.gz").getPath
  private var sc: SparkContext = _

  before {
    // allowMultipleContexts avoids clashes with other suites in the same JVM.
    val conf = new SparkConf()
      .setMaster("local[4]")
      .setAppName("example-spark")
      .set("spark.driver.allowMultipleContexts", "true")
    sc = new SparkContext(conf)
  }

  test("Video information extractor DF") {
    val videos = RecordLoader.loadArchives(warcPath, sc).videos()
    val results = VideoInformationExtractor(videos).collect()

    // The example WARC contains a single video record.
    assert(results.length == 1)

    // Spot-check every column of the first row: url, filename, extension,
    // mime_type_web_server, mime_type_tika, md5, sha1.
    val first = results(0)
    assert(first.get(0) == "https://ruebot.net/2018-11-12%2016.14.11.mp4")
    assert(first.get(1) == "2018-11-12%2016.14.11.mp4")
    assert(first.get(2) == "mp4")
    assert(first.get(3) == "video/mp4")
    assert(first.get(4) == "video/mp4")
    assert(first.get(5) == "2cde7de3213a87269957033f6315fce2")
    assert(first.get(6) == "f28c72fa4c0464a1a2b81fdc539b28cf574ac4c2")
  }

  after {
    // Stop the context if the fixture managed to create one.
    Option(sc).foreach(_.stop())
  }
}
/*
 * Copyright © 2017 The Archives Unleashed Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.archivesunleashed.app

import com.google.common.io.Resources
import io.archivesunleashed.RecordLoader
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{BeforeAndAfter, FunSuite}

/** Exercises [[WebGraphExtractor]] against the bundled example WARC. */
@RunWith(classOf[JUnitRunner])
class WebGraphExtractorTest extends FunSuite with BeforeAndAfter {
  private val warcPath = Resources.getResource("warc/example.warc.gz").getPath
  private var sc: SparkContext = _

  before {
    // allowMultipleContexts avoids clashes with other suites in the same JVM.
    val conf = new SparkConf()
      .setMaster("local[4]")
      .setAppName("example-spark")
      .set("spark.driver.allowMultipleContexts", "true")
    sc = new SparkContext(conf)
  }

  test("Web graph extractor DF") {
    val webgraph = RecordLoader.loadArchives(warcPath, sc).webgraph()
    val results = WebGraphExtractor(webgraph).collect()

    // The example WARC yields 622 link-graph edges.
    assert(results.length == 622)

    // Spot-check every column of the first row.
    val first = results(0)
    assert(first.get(0) == "20080430")
    assert(first.get(1) == "http://www.archive.org/")
    assert(first.get(2) == "http://www.archive.org")
    assert(first.get(3) == "http://www.archive.org")
  }

  after {
    // Stop the context if the fixture managed to create one.
    Option(sc).foreach(_.stop())
  }
}
/*
 * Copyright © 2017 The Archives Unleashed Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.archivesunleashed.app

import com.google.common.io.Resources
import io.archivesunleashed.RecordLoader
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{BeforeAndAfter, FunSuite}

/** Exercises [[WordProcessorInformationExtractor]] against the docs example WARC. */
@RunWith(classOf[JUnitRunner])
class WordProcessorInformationExtractorTest extends FunSuite with BeforeAndAfter {
  private val warcPath = Resources.getResource("warc/example.docs.warc.gz").getPath
  private var sc: SparkContext = _

  before {
    // allowMultipleContexts avoids clashes with other suites in the same JVM.
    val conf = new SparkConf()
      .setMaster("local[4]")
      .setAppName("example-spark")
      .set("spark.driver.allowMultipleContexts", "true")
    sc = new SparkContext(conf)
  }

  test("Word processor information extractor DF") {
    val wordDocs = RecordLoader.loadArchives(warcPath, sc).wordProcessorFiles()
    val results = WordProcessorInformationExtractor(wordDocs).collect()

    // The example WARC contains 3 word-processor records.
    assert(results.length == 3)

    // Spot-check every column of the first row: url, filename, extension,
    // mime_type_web_server, mime_type_tika, md5, sha1.
    val first = results(0)
    assert(first.get(0) == "https://ruebot.net/files/aut-test-fixtures/test-aut-fixtures.rtf")
    assert(first.get(1) == "test-aut-fixtures.rtf")
    assert(first.get(2) == "rtf")
    assert(first.get(3) == "application/rtf")
    assert(first.get(4) == "application/rtf")
    assert(first.get(5) == "e483512b65ba44d71e843c57de2adeb7")
    assert(first.get(6) == "8cf3066421f0a07fcd6e7a3e86ebd447edf7cfcb")
  }

  after {
    // Stop the context if the fixture managed to create one.
    Option(sc).foreach(_.stop())
  }
}