Add a number of additional app extractors. #451

Merged
merged 2 commits into from Apr 21, 2020
Changes from all commits

43 changes: 43 additions & 0 deletions src/main/scala/io/archivesunleashed/app/AudioInformationExtractor.scala
@@ -0,0 +1,43 @@
/*
* Copyright © 2017 The Archives Unleashed Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object AudioInformationExtractor {
/** Extract information about audio files from web archives using
* DataFrames and Spark SQL.
*
* @param d DataFrame obtained from RecordLoader
* @return Dataset[Row], where the schema is (url, filename, extension,
* mime_type_web_server, mime_type_tika, md5, sha1)
*/
def apply(d: DataFrame): Dataset[Row] = {
val spark = SparkSession.builder().master("local").getOrCreate()
// scalastyle:off
import spark.implicits._
// scalastyle:on
d.select($"url",
$"filename",
$"extension",
$"mime_type_web_server",
$"mime_type_tika",
$"md5",
$"sha1")
}
}
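For reference, a minimal spark-shell sketch of using this extractor outside the
CommandLineApp (the WARC path and output directory are hypothetical; sc is the
shell's SparkContext):

import io.archivesunleashed._
import io.archivesunleashed.app.AudioInformationExtractor

// Load the audio records of an archive, select the informational columns via
// the extractor, and write them out as headered CSV. Paths are illustrative.
val audio = RecordLoader.loadArchives("data/example.warc.gz", sc).audio()
AudioInformationExtractor(audio)
  .write
  .option("header", "true")
  .csv("audio-information/")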
98 changes: 88 additions & 10 deletions src/main/scala/io/archivesunleashed/app/CommandLineApp.scala
@@ -34,26 +34,32 @@ import org.rogach.scallop.ScallopConf
* --input INPUT_FILE ...
* --output OUTPUT_DIRECTORY
* [--output-format FORMAT]
* [--df]
* [--split]
* [--partition N]
*
* where EXTRACTOR is one of
* AudioInformationExtractor, DomainFrequencyExtractor, DomainGraphExtractor,
* ImageGraphExtractor, ImageInformationExtractor, PDFInformationExtractor,
* PlainTextExtractor, PresentationProgramInformationExtractor,
* SpreadsheetInformationExtractor, TextFilesInformationExtractor,
* VideoInformationExtractor, WebGraphExtractor, WebPagesExtractor,
* or WordProcessorInformationExtractor.
*
* INPUT_FILE is a list of input files separated by space (or path containing wildcard)
* OUTPUT_DIRECTORY is the directory to put result in
*
* FORMAT is meant to work with DomainGraphExtractor.
* Three supported options are csv (default), gexf, or graphml.
*
* If --split is present, the program will put results for each input file in its own folder.
* Otherwise they will be merged.
*
* If --partition N is present, the program will partition the DataFrame according
* to N before writing results. Otherwise, the partitioning is left as is.
*/

/** Construct a Scallop option reader from command line argument string list.
*
* @param args list of command line arguments passed as is from argv
*/
@@ -79,7 +85,7 @@ class CmdAppConf(args: Seq[String]) extends ScallopConf(args) {
val input = opt[List[String]](descr = "input file path", required = true)
val output = opt[String](descr = "output directory path", required = true)
val outputFormat = opt[String](descr =
"output format for DomainGraphExtractor, one of CSV, GEXF, or GRAPHML")
"output format for DomainGraphExtractor, one of csv, gexf, or graphml")
val split = opt[Boolean]()
val partition = opt[Int]()
verify()
@@ -105,6 +111,14 @@ class CommandLineApp(conf: CmdAppConf) {
*/

private val extractors = Map[String, List[String] => Any](
"AudioInformationExtractor" ->
((inputFiles: List[String]) => {
var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).audio()
inputFiles.tail foreach { f =>
df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).audio())
}
save(AudioInformationExtractor(df))
}),
"DomainFrequencyExtractor" ->
((inputFiles: List[String]) => {
var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).webpages()
@@ -119,16 +133,24 @@ class CommandLineApp(conf: CmdAppConf) {
inputFiles.tail foreach { f =>
df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).webgraph())
}
if (!configuration.outputFormat.isEmpty && configuration.outputFormat() == "gexf") {
new File(saveTarget).mkdirs()
WriteGEXF(DomainGraphExtractor(df).collect(), Paths.get(saveTarget).toString + "/GEXF.gexf")
} else if (!configuration.outputFormat.isEmpty && configuration.outputFormat() == "graphml") {
new File(saveTarget).mkdirs()
WriteGraphML(DomainGraphExtractor(df).collect(), Paths.get(saveTarget).toString + "/GRAPHML.graphml")
} else {
save(DomainGraphExtractor(df))
}
}),
"ImageInformationExtractor" ->
((inputFiles: List[String]) => {
var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).images()
inputFiles.tail foreach { f =>
df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).images())
}
save(ImageInformationExtractor(df))
}),
"ImageGraphExtractor" ->
((inputFiles: List[String]) => {
var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).imagegraph()
@@ -137,6 +159,14 @@ class CommandLineApp(conf: CmdAppConf) {
}
save(ImageGraphExtractor(df))
}),
"PDFInformationExtractor" ->
((inputFiles: List[String]) => {
var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).pdfs()
inputFiles.tail foreach { f =>
df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).pdfs())
}
save(PDFInformationExtractor(df))
}),
"PlainTextExtractor" ->
((inputFiles: List[String]) => {
var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).webpages()
@@ -145,13 +175,61 @@ class CommandLineApp(conf: CmdAppConf) {
}
save(PlainTextExtractor(df))
}),
"PresentationProgramInformationExtractor" ->
((inputFiles: List[String]) => {
var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).presentationProgramFiles()
inputFiles.tail foreach { f =>
df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).presentationProgramFiles())
}
save(PresentationProgramInformationExtractor(df))
}),
"SpreadsheetInformationExtractor" ->
((inputFiles: List[String]) => {
var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).spreadsheets()
inputFiles.tail foreach { f =>
df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).spreadsheets())
}
save(SpreadsheetInformationExtractor(df))
}),
"TextFilesInformationExtractor" ->
((inputFiles: List[String]) => {
var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).textFiles()
inputFiles.tail foreach { f =>
df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).textFiles())
}
save(TextFilesInformationExtractor(df))
}),
"VideoInformationExtractor" ->
((inputFiles: List[String]) => {
var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).videos()
inputFiles.tail foreach { f =>
df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).videos())
}
save(VideoInformationExtractor(df))
}),
"WebGraphExtractor" ->
((inputFiles: List[String]) => {
var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).webgraph()
inputFiles.tail foreach { f =>
df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).webgraph())
}
save(WebGraphExtractor(df))
}),
"WebPagesExtractor" ->
((inputFiles: List[String]) => {
var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).webpages()
inputFiles.tail foreach { f =>
df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).webpages())
}
save(WebPagesExtractor(df))
}),
"WordProcessorInformationExtractor" ->
((inputFiles: List[String]) => {
var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).wordProcessorFiles()
inputFiles.tail foreach { f =>
df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).wordProcessorFiles())
}
save(WordProcessorInformationExtractor(df))
})
)

src/main/scala/io/archivesunleashed/app/DomainFrequencyExtractor.scala
@@ -32,7 +32,6 @@ object DomainFrequencyExtractor {
// scalastyle:off
import spark.implicits._
// scalastyle:on

d.groupBy(ExtractDomainDF($"url").as("domain"))
.count()
.sort($"count".desc)
src/main/scala/io/archivesunleashed/app/ImageGraphExtractor.scala
@@ -17,7 +17,6 @@
package io.archivesunleashed.app

import io.archivesunleashed.{ArchiveRecord, DataFrameLoader}
import org.apache.spark.sql.functions.desc
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object ImageGraphExtractor {
45 changes: 45 additions & 0 deletions src/main/scala/io/archivesunleashed/app/ImageInformationExtractor.scala
@@ -0,0 +1,45 @@
/*
* Copyright © 2017 The Archives Unleashed Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object ImageInformationExtractor {
/** Extract information about images from web archives using DataFrames
* and Spark SQL.
*
* @param d DataFrame obtained from RecordLoader
* @return Dataset[Row], where the schema is (url, filename, extension,
* mime_type_web_server, mime_type_tika, width, height, md5, sha1)
*/
def apply(d: DataFrame): Dataset[Row] = {
val spark = SparkSession.builder().master("local").getOrCreate()
// scalastyle:off
import spark.implicits._
// scalastyle:on
d.select($"url",
$"filename",
$"extension",
$"mime_type_web_server",
$"mime_type_tika",
$"width",
$"height",
$"md5",
$"sha1")
}
}
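Since this extractor also surfaces width and height, a downstream filter is a
natural follow-up. A sketch under the same assumptions as above (hypothetical
paths, spark-shell session):

import io.archivesunleashed._
import io.archivesunleashed.app.ImageInformationExtractor
import org.apache.spark.sql.functions.col

// Keep only metadata for reasonably large images before writing it out;
// the 300-pixel threshold is an arbitrary example value.
val images = RecordLoader.loadArchives("data/example.warc.gz", sc).images()
ImageInformationExtractor(images)
  .filter(col("width") >= 300 && col("height") >= 300)
  .write
  .csv("image-information/")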
43 changes: 43 additions & 0 deletions src/main/scala/io/archivesunleashed/app/PDFInformationExtractor.scala
@@ -0,0 +1,43 @@
/*
* Copyright © 2017 The Archives Unleashed Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object PDFInformationExtractor {
/** Extract information about PDFs from web archives using DataFrames
* and Spark SQL.
*
* @param d DataFrame obtained from RecordLoader
* @return Dataset[Row], where the schema is (url, filename, extension,
* mime_type_web_server, mime_type_tika, md5, sha1)
*/
def apply(d: DataFrame): Dataset[Row] = {
val spark = SparkSession.builder().master("local").getOrCreate()
// scalastyle:off
import spark.implicits._
// scalastyle:on
d.select($"url",
$"filename",
$"extension",
$"mime_type_web_server",
$"mime_type_tika",
$"md5",
$"sha1")
}
}
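The CommandLineApp above handles multiple inputs by unioning one DataFrame per
file; the same pattern works standalone with this extractor (file names are
hypothetical):

import io.archivesunleashed._
import io.archivesunleashed.app.PDFInformationExtractor

// Load the first archive, union in the rest, then extract once at the end.
val inputs = List("data/a.warc.gz", "data/b.warc.gz")
var pdfs = RecordLoader.loadArchives(inputs.head, sc).pdfs()
inputs.tail.foreach { f =>
  pdfs = pdfs.union(RecordLoader.loadArchives(f, sc).pdfs())
}
PDFInformationExtractor(pdfs).write.csv("pdf-information/")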
43 changes: 43 additions & 0 deletions src/main/scala/io/archivesunleashed/app/PresentationProgramInformationExtractor.scala
@@ -0,0 +1,43 @@
/*
* Copyright © 2017 The Archives Unleashed Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object PresentationProgramInformationExtractor {
/** Extract information about presentation program files
* from web archives using DataFrames and Spark SQL.
*
* @param d DataFrame obtained from RecordLoader
* @return Dataset[Row], where the schema is (url, filename, extension,
* mime_type_web_server, mime_type_tika, md5, sha1)
*/
def apply(d: DataFrame): Dataset[Row] = {
val spark = SparkSession.builder().master("local").getOrCreate()
// scalastyle:off
import spark.implicits._
// scalastyle:on
d.select($"url",
$"filename",
$"extension",
$"mime_type_web_server",
$"mime_type_tika",
$"md5",
$"sha1")
}
}