diff --git a/src/main/scala/io/archivesunleashed/app/AudioInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/AudioInformationExtractor.scala new file mode 100644 index 00000000..d0c19926 --- /dev/null +++ b/src/main/scala/io/archivesunleashed/app/AudioInformationExtractor.scala @@ -0,0 +1,43 @@ +/* + * Copyright © 2017 The Archives Unleashed Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.archivesunleashed.app + +import io.archivesunleashed.ArchiveRecord +import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} + +object AudioInformationExtractor { + /** Extract information about audio files from web archive using + * DataFrame and Spark SQL. 
+ * + * @param d DataFrame obtained from RecordLoader + * @return Dataset[Row], where the schema is (url, filename, extension, + * mime_type_web_server, mime_type_tika, md5, sha1) + */ + def apply(d: DataFrame): Dataset[Row] = { + val spark = SparkSession.builder().master("local").getOrCreate() + // scalastyle:off + import spark.implicits._ + // scalastyle:on + d.select($"url", + $"filename", + $"extension", + $"mime_type_web_server", + $"mime_type_tika", + $"md5", + $"sha1") + } +} diff --git a/src/main/scala/io/archivesunleashed/app/CommandLineApp.scala b/src/main/scala/io/archivesunleashed/app/CommandLineApp.scala index 2838b96d..18f96866 100644 --- a/src/main/scala/io/archivesunleashed/app/CommandLineApp.scala +++ b/src/main/scala/io/archivesunleashed/app/CommandLineApp.scala @@ -34,26 +34,32 @@ import org.rogach.scallop.ScallopConf * --input INPUT_FILE ... * --output OUTPUT_DIRECTORY * [--output-format FORMAT] - * [--df] * [--split] * [--partiton] * * where EXTRACTOR is one of - * DomainFrequencyExtractor, DomainGraphExtractor or PlainTextExtractor + * AudioInformationExtractor, DomainFrequencyExtractor, DomainGraphExtractor, + * ImageGraphExtractor, ImageInformationExtractor, PDFInformationExtractor, + * PlainTextExtractor, PresentationProgramInformationExtractor, + * SpreadsheetInformationExtractor, TextFilesInformationExtractor, + * VideoInformationExtractor, WebGraphExtractor, WebPagesExtractor, + * or WordProcessorInformationExtractor. * * INPUT_FILE is a list of input files separated by space (or path containing wildcard) * OUTPUT_DIRECTORY is the directory to put result in * * FORMAT is meant to work with DomainGraphExtractor - * Three supported options are CSV (default), GEXF, or GRAPHML + * Three supported options are csv (default), gexf, or graphml; + * gexf and graphml apply only to DomainGraphExtractor. * - * If --split is present, the program will put results for each input file in its own folder. Otherwise they will be merged.
+ * If --split is present, the program will put results for each input file in its own folder. + * Otherwise they will be merged. * - * If --partition N is present, the program will partition the DataFrame according to N before writing results. - * Otherwise, the partition is left as is. + * If --partition N is present, the program will partition the DataFrame according + * to N before writing results. Otherwise, the partition is left as is. */ -/** Construct a Scallop option reader from command line argument string list +/** Construct a Scallop option reader from command line argument string list. * * @param args list of command line arguments passed as is from argv */ @@ -79,7 +85,7 @@ class CmdAppConf(args: Seq[String]) extends ScallopConf(args) { val input = opt[List[String]](descr = "input file path", required = true) val output = opt[String](descr = "output directory path", required = true) val outputFormat = opt[String](descr = - "output format for DomainGraphExtractor, one of CSV, GEXF, or GRAPHML") + "output format for DomainGraphExtractor, one of csv, gexf, or graphml") val split = opt[Boolean]() val partition = opt[Int]() verify() @@ -105,6 +111,14 @@ class CommandLineApp(conf: CmdAppConf) { */ private val extractors = Map[String, List[String] => Any]( + "AudioInformationExtractor" -> + ((inputFiles: List[String]) => { + var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).audio() + inputFiles.tail foreach { f => + df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).audio()) + } + save(AudioInformationExtractor(df)) + }), "DomainFrequencyExtractor" -> ((inputFiles: List[String]) => { var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).webpages() @@ -119,16 +133,24 @@ class CommandLineApp(conf: CmdAppConf) { inputFiles.tail foreach { f => df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).webgraph()) } - if (!configuration.outputFormat.isEmpty && configuration.outputFormat() == "GEXF") { + if 
(!configuration.outputFormat.isEmpty && configuration.outputFormat() == "gexf") { new File(saveTarget).mkdirs() WriteGEXF(DomainGraphExtractor(df).collect(), Paths.get(saveTarget).toString + "/GEXF.gexf") - } else if (!configuration.outputFormat.isEmpty && configuration.outputFormat() == "GRAPHML") { + } else if (!configuration.outputFormat.isEmpty && configuration.outputFormat() == "graphml") { new File(saveTarget).mkdirs() WriteGraphML(DomainGraphExtractor(df).collect(), Paths.get(saveTarget).toString + "/GRAPHML.graphml") } else { save(DomainGraphExtractor(df)) } }), + "ImageInformationExtractor" -> + ((inputFiles: List[String]) => { + var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).images() + inputFiles.tail foreach { f => + df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).images()) + } + save(ImageInformationExtractor(df)) + }), "ImageGraphExtractor" -> ((inputFiles: List[String]) => { var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).imagegraph() @@ -137,6 +159,14 @@ class CommandLineApp(conf: CmdAppConf) { } save(ImageGraphExtractor(df)) }), + "PDFInformationExtractor" -> + ((inputFiles: List[String]) => { + var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).pdfs() + inputFiles.tail foreach { f => + df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).pdfs()) + } + save(PDFInformationExtractor(df)) + }), "PlainTextExtractor" -> ((inputFiles: List[String]) => { var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).webpages() @@ -145,6 +175,46 @@ class CommandLineApp(conf: CmdAppConf) { } save(PlainTextExtractor(df)) }), + "PresentationProgramInformationExtractor" -> + ((inputFiles: List[String]) => { + var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).presentationProgramFiles() + inputFiles.tail foreach { f => + df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).presentationProgramFiles()) + } + save(PresentationProgramInformationExtractor(df)) + }), + 
"SpreadsheetInformationExtractor" -> + ((inputFiles: List[String]) => { + var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).spreadsheets() + inputFiles.tail foreach { f => + df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).spreadsheets()) + } + save(SpreadsheetInformationExtractor(df)) + }), + "TextFilesInformationExtractor" -> + ((inputFiles: List[String]) => { + var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).textFiles() + inputFiles.tail foreach { f => + df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).textFiles()) + } + save(TextFilesInformationExtractor(df)) + }), + "VideoInformationExtractor" -> + ((inputFiles: List[String]) => { + var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).videos() + inputFiles.tail foreach { f => + df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).videos()) + } + save(VideoInformationExtractor(df)) + }), + "WebGraphExtractor" -> + ((inputFiles: List[String]) => { + var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).webgraph() + inputFiles.tail foreach { f => + df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).webgraph()) + } + save(WebGraphExtractor(df)) + }), "WebPagesExtractor" -> ((inputFiles: List[String]) => { var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).webpages() @@ -152,6 +222,14 @@ class CommandLineApp(conf: CmdAppConf) { df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).webpages()) } save(WebPagesExtractor(df)) + }), + "WordProcessorInformationExtractor" -> + ((inputFiles: List[String]) => { + var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).wordProcessorFiles() + inputFiles.tail foreach { f => + df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).wordProcessorFiles()) + } + save(WordProcessorInformationExtractor(df)) }) ) diff --git a/src/main/scala/io/archivesunleashed/app/DomainFrequencyExtractor.scala b/src/main/scala/io/archivesunleashed/app/DomainFrequencyExtractor.scala 
index 9e87a488..87b28ca0 100644 --- a/src/main/scala/io/archivesunleashed/app/DomainFrequencyExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/DomainFrequencyExtractor.scala @@ -32,7 +32,6 @@ object DomainFrequencyExtractor { // scalastyle:off import spark.implicits._ // scalastyle:on - d.groupBy(ExtractDomainDF($"url").as("domain")) .count() .sort($"count".desc) diff --git a/src/main/scala/io/archivesunleashed/app/ImageGraphExtractor.scala b/src/main/scala/io/archivesunleashed/app/ImageGraphExtractor.scala index 62da97f0..d5647eaf 100644 --- a/src/main/scala/io/archivesunleashed/app/ImageGraphExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/ImageGraphExtractor.scala @@ -17,7 +17,6 @@ package io.archivesunleashed.app import io.archivesunleashed.{ArchiveRecord, DataFrameLoader} -import org.apache.spark.sql.functions.desc import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} object ImageGraphExtractor { diff --git a/src/main/scala/io/archivesunleashed/app/ImageInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/ImageInformationExtractor.scala new file mode 100644 index 00000000..20bbfb2c --- /dev/null +++ b/src/main/scala/io/archivesunleashed/app/ImageInformationExtractor.scala @@ -0,0 +1,45 @@ +/* + * Copyright © 2017 The Archives Unleashed Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.archivesunleashed.app + +import io.archivesunleashed.ArchiveRecord +import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} + +object ImageInformationExtractor { + /** Extract information about images from web archive using DataFrame + * and Spark SQL. + * + * @param d DataFrame obtained from RecordLoader + * @return Dataset[Row], where the schema is (crawl date, url, + * mime_type_web_server, mime_type_tika, width, height, language, content) + */ + def apply(d: DataFrame): Dataset[Row] = { + val spark = SparkSession.builder().master("local").getOrCreate() + // scalastyle:off + import spark.implicits._ + // scalastyle:on + d.select($"url", + $"filename", + $"extension", + $"mime_type_web_server", + $"mime_type_tika", + $"width", + $"height", + $"md5", + $"sha1") + } +} diff --git a/src/main/scala/io/archivesunleashed/app/PDFInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/PDFInformationExtractor.scala new file mode 100644 index 00000000..46e61bda --- /dev/null +++ b/src/main/scala/io/archivesunleashed/app/PDFInformationExtractor.scala @@ -0,0 +1,43 @@ +/* + * Copyright © 2017 The Archives Unleashed Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.archivesunleashed.app + +import io.archivesunleashed.ArchiveRecord +import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} + +object PDFInformationExtractor { + /** Extract information about PDFs from web archive using DataFrame + * and Spark SQL. + * + * @param d DataFrame obtained from RecordLoader + * @return Dataset[Row], where the schema is (crawl date, url, + * mime_type_web_server, mime_type_tika, language, content) + */ + def apply(d: DataFrame): Dataset[Row] = { + val spark = SparkSession.builder().master("local").getOrCreate() + // scalastyle:off + import spark.implicits._ + // scalastyle:on + d.select($"url", + $"filename", + $"extension", + $"mime_type_web_server", + $"mime_type_tika", + $"md5", + $"sha1") + } +} diff --git a/src/main/scala/io/archivesunleashed/app/PresentationProgramInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/PresentationProgramInformationExtractor.scala new file mode 100644 index 00000000..7aa9d02f --- /dev/null +++ b/src/main/scala/io/archivesunleashed/app/PresentationProgramInformationExtractor.scala @@ -0,0 +1,43 @@ +/* + * Copyright © 2017 The Archives Unleashed Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.archivesunleashed.app + +import io.archivesunleashed.ArchiveRecord +import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} + +object PresentationProgramInformationExtractor { + /** Extract information about presentation program files + * from web archive using DataFrame and Spark SQL. + * + * @param d DataFrame obtained from RecordLoader + * @return Dataset[Row], where the schema is (crawl date, url, + * mime_type_web_server, mime_type_tika, language, content) + */ + def apply(d: DataFrame): Dataset[Row] = { + val spark = SparkSession.builder().master("local").getOrCreate() + // scalastyle:off + import spark.implicits._ + // scalastyle:on + d.select($"url", + $"filename", + $"extension", + $"mime_type_web_server", + $"mime_type_tika", + $"md5", + $"sha1") + } +} diff --git a/src/main/scala/io/archivesunleashed/app/SpreadsheetInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/SpreadsheetInformationExtractor.scala new file mode 100644 index 00000000..ec153068 --- /dev/null +++ b/src/main/scala/io/archivesunleashed/app/SpreadsheetInformationExtractor.scala @@ -0,0 +1,43 @@ +/* + * Copyright © 2017 The Archives Unleashed Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.archivesunleashed.app + +import io.archivesunleashed.ArchiveRecord +import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} + +object SpreadsheetInformationExtractor { + /** Extract information about spreadsheets from web archive using + * DataFrame and Spark SQL. + * + * @param d DataFrame obtained from RecordLoader + * @return Dataset[Row], where the schema is (crawl date, url, + * mime_type_web_server, mime_type_tika, language, content) + */ + def apply(d: DataFrame): Dataset[Row] = { + val spark = SparkSession.builder().master("local").getOrCreate() + // scalastyle:off + import spark.implicits._ + // scalastyle:on + d.select($"url", + $"filename", + $"extension", + $"mime_type_web_server", + $"mime_type_tika", + $"md5", + $"sha1") + } +} diff --git a/src/main/scala/io/archivesunleashed/app/TextFilesInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/TextFilesInformationExtractor.scala new file mode 100644 index 00000000..8a9e626a --- /dev/null +++ b/src/main/scala/io/archivesunleashed/app/TextFilesInformationExtractor.scala @@ -0,0 +1,43 @@ +/* + * Copyright © 2017 The Archives Unleashed Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.archivesunleashed.app + +import io.archivesunleashed.ArchiveRecord +import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} + +object TextFilesInformationExtractor { + /** Extract information about text files from web archive using + * DataFrame and Spark SQL. + * + * @param d DataFrame obtained from RecordLoader + * @return Dataset[Row], where the schema is (crawl date, url, + * mime_type_web_server, mime_type_tika, language, content) + */ + def apply(d: DataFrame): Dataset[Row] = { + val spark = SparkSession.builder().master("local").getOrCreate() + // scalastyle:off + import spark.implicits._ + // scalastyle:on + d.select($"url", + $"filename", + $"extension", + $"mime_type_web_server", + $"mime_type_tika", + $"md5", + $"sha1") + } +} diff --git a/src/main/scala/io/archivesunleashed/app/VideoInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/VideoInformationExtractor.scala new file mode 100644 index 00000000..654a3427 --- /dev/null +++ b/src/main/scala/io/archivesunleashed/app/VideoInformationExtractor.scala @@ -0,0 +1,43 @@ +/* + * Copyright © 2017 The Archives Unleashed Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.archivesunleashed.app + +import io.archivesunleashed.ArchiveRecord +import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} + +object VideoInformationExtractor { + /** Extract information about videos from web archive using DataFrame + * and Spark SQL. 
+ * + * @param d DataFrame obtained from RecordLoader + * @return Dataset[Row], where the schema is (url, filename, extension, + * mime_type_web_server, mime_type_tika, md5, sha1) + */ + def apply(d: DataFrame): Dataset[Row] = { + val spark = SparkSession.builder().master("local").getOrCreate() + // scalastyle:off + import spark.implicits._ + // scalastyle:on + d.select($"url", + $"filename", + $"extension", + $"mime_type_web_server", + $"mime_type_tika", + $"md5", + $"sha1") + } +} diff --git a/src/main/scala/io/archivesunleashed/app/WebGraphExtractor.scala b/src/main/scala/io/archivesunleashed/app/WebGraphExtractor.scala new file mode 100644 index 00000000..f8782615 --- /dev/null +++ b/src/main/scala/io/archivesunleashed/app/WebGraphExtractor.scala @@ -0,0 +1,34 @@ +/* + * Copyright © 2017 The Archives Unleashed Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.archivesunleashed.app + +import io.archivesunleashed.{ArchiveRecord, DataFrameLoader} +import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} + +object WebGraphExtractor { + + /** Extract web graph from web archive using DataFrame and Spark SQL.
+ * + * @param d DataFrame obtained from RecordLoader + * @return Dataset[Row], where the schema is (crawl date, src, dest, + * anchor text) + */ + def apply(d: DataFrame): Dataset[Row] = { + val spark = SparkSession.builder().master("local").getOrCreate() + d + } +} diff --git a/src/main/scala/io/archivesunleashed/app/WebPagesExtractor.scala b/src/main/scala/io/archivesunleashed/app/WebPagesExtractor.scala index 71219151..aaa1e8eb 100644 --- a/src/main/scala/io/archivesunleashed/app/WebPagesExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/WebPagesExtractor.scala @@ -18,7 +18,7 @@ package io.archivesunleashed.app import io.archivesunleashed.ArchiveRecord import io.archivesunleashed.df.{ExtractDomainDF, RemoveHTMLDF, - RemoveHTTPHeaderDF} + RemoveHTTPHeaderDF, RemovePrefixWWWDF} import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} object WebPagesExtractor { @@ -30,6 +30,15 @@ object WebPagesExtractor { */ def apply(d: DataFrame): Dataset[Row] = { val spark = SparkSession.builder().master("local").getOrCreate() - d + // scalastyle:off + import spark.implicits._ + // scalastyle:on + d.select($"crawl_date", + RemovePrefixWWWDF(ExtractDomainDF($"url")).as("domain"), + $"url", + $"mime_type_web_server", + $"mime_type_tika", + $"language", + RemoveHTMLDF(RemoveHTTPHeaderDF(($"content"))).alias("content")) } } diff --git a/src/main/scala/io/archivesunleashed/app/WordProcessorInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/WordProcessorInformationExtractor.scala new file mode 100644 index 00000000..3ebc8bb1 --- /dev/null +++ b/src/main/scala/io/archivesunleashed/app/WordProcessorInformationExtractor.scala @@ -0,0 +1,43 @@ +/* + * Copyright © 2017 The Archives Unleashed Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.archivesunleashed.app + +import io.archivesunleashed.ArchiveRecord +import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} + +object WordProcessorInformationExtractor { + /** Extract information about word processor files from web archive + * using DataFrame and Spark SQL. + * + * @param d DataFrame obtained from RecordLoader + * @return Dataset[Row], where the schema is (crawl date, url, + * mime_type_web_server, mime_type_tika, language, content) + */ + def apply(d: DataFrame): Dataset[Row] = { + val spark = SparkSession.builder().master("local").getOrCreate() + // scalastyle:off + import spark.implicits._ + // scalastyle:on + d.select($"url", + $"filename", + $"extension", + $"mime_type_web_server", + $"mime_type_tika", + $"md5", + $"sha1") + } +} diff --git a/src/test/scala/io/archivesunleashed/app/AudioInformationExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/AudioInformationExtractorTest.scala new file mode 100644 index 00000000..ad8730af --- /dev/null +++ b/src/test/scala/io/archivesunleashed/app/AudioInformationExtractorTest.scala @@ -0,0 +1,61 @@ +/* + * Copyright © 2017 The Archives Unleashed Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.archivesunleashed.app + +import com.google.common.io.Resources +import io.archivesunleashed.RecordLoader +import org.apache.spark.{SparkConf, SparkContext} +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner +import org.scalatest.{BeforeAndAfter, FunSuite} + +@RunWith(classOf[JUnitRunner]) +class AudioInformationExtractorTest extends FunSuite with BeforeAndAfter { + private val arcPath = Resources.getResource("warc/example.media.warc.gz").getPath + private var sc: SparkContext = _ + private val master = "local[4]" + private val appName = "example-spark" + + before { + val conf = new SparkConf() + .setMaster(master) + .setAppName(appName) + conf.set("spark.driver.allowMultipleContexts", "true") + sc = new SparkContext(conf) + } + + test("Audio information extractor DF") { + val df = RecordLoader.loadArchives(arcPath, sc).audio() + val dfResults = AudioInformationExtractor(df).collect() + val RESULTSLENGTH = 1 + + assert(dfResults.length == RESULTSLENGTH) + assert(dfResults(0).get(0) == "https://ruebot.net/files/feniz.mp3") + assert(dfResults(0).get(1) == "feniz.mp3") + assert(dfResults(0).get(2) == "mp3") + assert(dfResults(0).get(3) == "audio/mpeg") + assert(dfResults(0).get(4) == "audio/mpeg") + assert(dfResults(0).get(5) == "f7e7ec84b12c294e19af1ba41732c733") + assert(dfResults(0).get(6) == "a3eb95dbbea76460529d0d9ebdde5faabaff544a") + } + + after { + if (sc != null) { + sc.stop() + } + } +} diff --git a/src/test/scala/io/archivesunleashed/app/CommandLineAppTest.scala 
b/src/test/scala/io/archivesunleashed/app/CommandLineAppTest.scala index ff96a583..9ebd02d9 100644 --- a/src/test/scala/io/archivesunleashed/app/CommandLineAppTest.scala +++ b/src/test/scala/io/archivesunleashed/app/CommandLineAppTest.scala @@ -54,7 +54,25 @@ class CommandLineAppTest extends FunSuite with BeforeAndAfter { Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, imageGraphOpt), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, imageGraphOpt, "--partition", "1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, webPagesOpt), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, webPagesOpt, "--partition", "1") + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, webPagesOpt, "--partition", "1"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "AudioInformationExtractor"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "AudioInformationExtractor", "--partition", "1"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "ImageInformationExtractor"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "ImageInformationExtractor", "--partition", "1"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PDFInformationExtractor"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PDFInformationExtractor", "--partition", "1"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PresentationProgramInformationExtractor"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PresentationProgramInformationExtractor", "--partition", "1"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "SpreadsheetInformationExtractor"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "SpreadsheetInformationExtractor", "--partition", "1"), + Array(inputOpt, arcPath, warcPath, outputOpt, 
outputDir, extractOpt, "TextFilesInformationExtractor"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "TextFilesInformationExtractor", "--partition", "1"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "VideoInformationExtractor"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "VideoInformationExtractor", "--partition", "1"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WordProcessorInformationExtractor"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WordProcessorInformationExtractor", "--partition", "1"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WebGraphExtractor"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WebGraphExtractor", "--partition", "1") ) private val testFailCmds = Array( diff --git a/src/test/scala/io/archivesunleashed/app/ImageInformationExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/ImageInformationExtractorTest.scala new file mode 100644 index 00000000..829cbc5d --- /dev/null +++ b/src/test/scala/io/archivesunleashed/app/ImageInformationExtractorTest.scala @@ -0,0 +1,63 @@ +/* + * Copyright © 2017 The Archives Unleashed Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.archivesunleashed.app + +import com.google.common.io.Resources +import io.archivesunleashed.RecordLoader +import org.apache.spark.{SparkConf, SparkContext} +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner +import org.scalatest.{BeforeAndAfter, FunSuite} + +@RunWith(classOf[JUnitRunner]) +class ImageInformationExtractorTest extends FunSuite with BeforeAndAfter { + private val arcPath = Resources.getResource("warc/example.warc.gz").getPath + private var sc: SparkContext = _ + private val master = "local[4]" + private val appName = "example-spark" + + before { + val conf = new SparkConf() + .setMaster(master) + .setAppName(appName) + conf.set("spark.driver.allowMultipleContexts", "true") + sc = new SparkContext(conf) + } + + test("Image information extractor DF") { + val df = RecordLoader.loadArchives(arcPath, sc).images() + val dfResults = ImageInformationExtractor(df).collect() + val RESULTSLENGTH = 55 + + assert(dfResults.length == RESULTSLENGTH) + assert(dfResults(0).get(0) == "http://www.archive.org/images/logoc.jpg") + assert(dfResults(0).get(1) == "logoc.jpg") + assert(dfResults(0).get(2) == "jpg") + assert(dfResults(0).get(3) == "image/jpeg") + assert(dfResults(0).get(4) == "image/jpeg") + assert(dfResults(0).get(5) == 70) + assert(dfResults(0).get(6) == 56) + assert(dfResults(0).get(7) == "8211d1fbb9b03d8522a1ae378f9d1b24") + assert(dfResults(0).get(8) == "a671e68fc211ee4996a91e99297f246b2c5faa1a") + } + + after { + if (sc != null) { + sc.stop() + } + } +} diff --git a/src/test/scala/io/archivesunleashed/app/PDFInformationExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/PDFInformationExtractorTest.scala new file mode 100644 index 00000000..9ef66f7b --- /dev/null +++ b/src/test/scala/io/archivesunleashed/app/PDFInformationExtractorTest.scala @@ -0,0 +1,61 @@ +/* + * Copyright © 2017 The Archives Unleashed Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use 
/*
 * Copyright © 2017 The Archives Unleashed Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.archivesunleashed.app

import com.google.common.io.Resources
import io.archivesunleashed.RecordLoader
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{BeforeAndAfter, FunSuite}

/** Exercises [[PDFInformationExtractor]] against the bundled PDF example WARC. */
@RunWith(classOf[JUnitRunner])
class PDFInformationExtractorTest extends FunSuite with BeforeAndAfter {
  private val warcPath = Resources.getResource("warc/example.pdf.warc.gz").getPath
  private var sc: SparkContext = _

  before {
    // allowMultipleContexts avoids clashes with other suites in the same JVM.
    val conf = new SparkConf()
      .setMaster("local[4]")
      .setAppName("example-spark")
      .set("spark.driver.allowMultipleContexts", "true")
    sc = new SparkContext(conf)
  }

  test("PDF information extractor DF") {
    val pdfs = RecordLoader.loadArchives(warcPath, sc).pdfs()
    val results = PDFInformationExtractor(pdfs).collect()

    // The example WARC contains 2 PDF records.
    assert(results.length == 2)

    // Spot-check every column of the first row: url, filename, extension,
    // mime_type_web_server, mime_type_tika, md5, sha1.
    val first = results(0)
    assert(first.get(0) == "https://yorkspace.library.yorku.ca/xmlui/bitstream/handle/10315/36158/cost-analysis.pdf?sequence=1&isAllowed=y")
    assert(first.get(1) == "cost-analysis.pdf")
    assert(first.get(2) == "pdf")
    assert(first.get(3) == "application/pdf")
    assert(first.get(4) == "application/pdf")
    assert(first.get(5) == "aaba59d2287afd40c996488a39bbc0dd")
    assert(first.get(6) == "569c28e0e8faa6945d6ca88fcd9e195825052c71")
  }

  after {
    // Stop the context if the fixture managed to create one.
    Option(sc).foreach(_.stop())
  }
}
/*
 * Copyright © 2017 The Archives Unleashed Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.archivesunleashed.app

import com.google.common.io.Resources
import io.archivesunleashed.RecordLoader
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{BeforeAndAfter, FunSuite}

/** Exercises [[PresentationProgramInformationExtractor]] against the docs example WARC. */
@RunWith(classOf[JUnitRunner])
class PresentationProgramInformationExtractorTest extends FunSuite with BeforeAndAfter {
  private val warcPath = Resources.getResource("warc/example.docs.warc.gz").getPath
  private var sc: SparkContext = _

  before {
    // allowMultipleContexts avoids clashes with other suites in the same JVM.
    val conf = new SparkConf()
      .setMaster("local[4]")
      .setAppName("example-spark")
      .set("spark.driver.allowMultipleContexts", "true")
    sc = new SparkContext(conf)
  }

  test("Presentation program information extractor DF") {
    val presentations = RecordLoader.loadArchives(warcPath, sc).presentationProgramFiles()
    val results = PresentationProgramInformationExtractor(presentations).collect()

    // The example WARC contains 2 presentation-program records.
    assert(results.length == 2)

    // Spot-check every column of the first row: url, filename, extension,
    // mime_type_web_server, mime_type_tika, md5, sha1.
    val first = results(0)
    assert(first.get(0) == "https://ruebot.net/files/aut-test-fixtures/aut-test-fixtures.pptx")
    assert(first.get(1) == "aut-test-fixtures.pptx")
    assert(first.get(2) == "pptx")
    assert(first.get(3) == "application/vnd.openxmlformats-officedocument.presentationml.presentation")
    assert(first.get(4) == "application/vnd.openxmlformats-officedocument.presentationml.presentation")
    assert(first.get(5) == "7a7b1fe4b6d311376eaced9de3b682ee")
    assert(first.get(6) == "86fadca47b134b68247ccde62da4ce3f62b4d2ec")
  }

  after {
    // Stop the context if the fixture managed to create one.
    Option(sc).foreach(_.stop())
  }
}
/*
 * Copyright © 2017 The Archives Unleashed Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.archivesunleashed.app

import com.google.common.io.Resources
import io.archivesunleashed.RecordLoader
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{BeforeAndAfter, FunSuite}

/** Exercises [[SpreadsheetInformationExtractor]] against the docs example WARC. */
@RunWith(classOf[JUnitRunner])
class SpreadsheetInformationExtractorTest extends FunSuite with BeforeAndAfter {
  private val warcPath = Resources.getResource("warc/example.docs.warc.gz").getPath
  private var sc: SparkContext = _

  before {
    // allowMultipleContexts avoids clashes with other suites in the same JVM.
    val conf = new SparkConf()
      .setMaster("local[4]")
      .setAppName("example-spark")
      .set("spark.driver.allowMultipleContexts", "true")
    sc = new SparkContext(conf)
  }

  test("Spreadsheet information extractor DF") {
    val spreadsheets = RecordLoader.loadArchives(warcPath, sc).spreadsheets()
    val results = SpreadsheetInformationExtractor(spreadsheets).collect()

    // The example WARC contains 4 spreadsheet records.
    assert(results.length == 4)

    // Spot-check every column of the first row: url, filename, extension,
    // mime_type_web_server, mime_type_tika, md5, sha1.
    val first = results(0)
    assert(first.get(0) == "https://ruebot.net/files/aut-test-fixtures/test-aut-fixture.ods")
    assert(first.get(1) == "test-aut-fixture.ods")
    assert(first.get(2) == "ods")
    assert(first.get(3) == "application/vnd.oasis.opendocument.spreadsheet")
    assert(first.get(4) == "application/vnd.oasis.opendocument.spreadsheet")
    assert(first.get(5) == "7f70280757d8beb2d1bfd6fb1b6ae6e9")
    assert(first.get(6) == "448c357e78317877a98a399448031a89f1dda6fb")
  }

  after {
    // Stop the context if the fixture managed to create one.
    Option(sc).foreach(_.stop())
  }
}
/*
 * Copyright © 2017 The Archives Unleashed Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.archivesunleashed.app

import com.google.common.io.Resources
import io.archivesunleashed.RecordLoader
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{BeforeAndAfter, FunSuite}

/** Exercises [[TextFilesInformationExtractor]] against the text example WARC. */
@RunWith(classOf[JUnitRunner])
class TextFilesInformationExtractorTest extends FunSuite with BeforeAndAfter {
  private val warcPath = Resources.getResource("warc/example.txt.warc.gz").getPath
  private var sc: SparkContext = _

  before {
    // allowMultipleContexts avoids clashes with other suites in the same JVM.
    val conf = new SparkConf()
      .setMaster("local[4]")
      .setAppName("example-spark")
      .set("spark.driver.allowMultipleContexts", "true")
    sc = new SparkContext(conf)
  }

  test("Text files information extractor DF") {
    val textFiles = RecordLoader.loadArchives(warcPath, sc).textFiles()
    val results = TextFilesInformationExtractor(textFiles).collect()

    // The example WARC contains a single text-file record.
    assert(results.length == 1)

    // Spot-check every column of the first row: url, filename, extension,
    // mime_type_web_server, mime_type_tika, md5, sha1.
    val first = results(0)
    assert(first.get(0) == "https://ruebot.net/files/aut-test-fixtures/aut-text.txt")
    assert(first.get(1) == "aut-text.txt")
    assert(first.get(2) == "txt")
    assert(first.get(3) == "text/plain")
    assert(first.get(4) == "application/gzip")
    assert(first.get(5) == "32abd404fb560ecf14b75611f3cc5c2c")
    assert(first.get(6) == "9dc9d163d933085348e90cd2b6e523e3139d3e88")
  }

  after {
    // Stop the context if the fixture managed to create one.
    Option(sc).foreach(_.stop())
  }
}
/*
 * Copyright © 2017 The Archives Unleashed Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.archivesunleashed.app

import com.google.common.io.Resources
import io.archivesunleashed.RecordLoader
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{BeforeAndAfter, FunSuite}

/** Exercises [[VideoInformationExtractor]] against the media example WARC. */
@RunWith(classOf[JUnitRunner])
class VideoInformationExtractorTest extends FunSuite with BeforeAndAfter {
  private val warcPath = Resources.getResource("warc/example.media.warc.gz").getPath
  private var sc: SparkContext = _

  before {
    // allowMultipleContexts avoids clashes with other suites in the same JVM.
    val conf = new SparkConf()
      .setMaster("local[4]")
      .setAppName("example-spark")
      .set("spark.driver.allowMultipleContexts", "true")
    sc = new SparkContext(conf)
  }

  test("Video information extractor DF") {
    val videos = RecordLoader.loadArchives(warcPath, sc).videos()
    val results = VideoInformationExtractor(videos).collect()

    // The example WARC contains a single video record.
    assert(results.length == 1)

    // Spot-check every column of the first row: url, filename, extension,
    // mime_type_web_server, mime_type_tika, md5, sha1.
    val first = results(0)
    assert(first.get(0) == "https://ruebot.net/2018-11-12%2016.14.11.mp4")
    assert(first.get(1) == "2018-11-12%2016.14.11.mp4")
    assert(first.get(2) == "mp4")
    assert(first.get(3) == "video/mp4")
    assert(first.get(4) == "video/mp4")
    assert(first.get(5) == "2cde7de3213a87269957033f6315fce2")
    assert(first.get(6) == "f28c72fa4c0464a1a2b81fdc539b28cf574ac4c2")
  }

  after {
    // Stop the context if the fixture managed to create one.
    Option(sc).foreach(_.stop())
  }
}
/*
 * Copyright © 2017 The Archives Unleashed Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.archivesunleashed.app

import com.google.common.io.Resources
import io.archivesunleashed.RecordLoader
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{BeforeAndAfter, FunSuite}

/** Exercises [[WebGraphExtractor]] against the bundled example WARC. */
@RunWith(classOf[JUnitRunner])
class WebGraphExtractorTest extends FunSuite with BeforeAndAfter {
  private val warcPath = Resources.getResource("warc/example.warc.gz").getPath
  private var sc: SparkContext = _

  before {
    // allowMultipleContexts avoids clashes with other suites in the same JVM.
    val conf = new SparkConf()
      .setMaster("local[4]")
      .setAppName("example-spark")
      .set("spark.driver.allowMultipleContexts", "true")
    sc = new SparkContext(conf)
  }

  test("Web graph extractor DF") {
    val webgraph = RecordLoader.loadArchives(warcPath, sc).webgraph()
    val results = WebGraphExtractor(webgraph).collect()

    // The example WARC yields 622 link-graph edges.
    assert(results.length == 622)

    // Spot-check every column of the first row.
    val first = results(0)
    assert(first.get(0) == "20080430")
    assert(first.get(1) == "http://www.archive.org/")
    assert(first.get(2) == "http://www.archive.org")
    assert(first.get(3) == "http://www.archive.org")
  }

  after {
    // Stop the context if the fixture managed to create one.
    Option(sc).foreach(_.stop())
  }
}
/*
 * Copyright © 2017 The Archives Unleashed Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.archivesunleashed.app

import com.google.common.io.Resources
import io.archivesunleashed.RecordLoader
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{BeforeAndAfter, FunSuite}

/** Exercises [[WordProcessorInformationExtractor]] against the docs example WARC. */
@RunWith(classOf[JUnitRunner])
class WordProcessorInformationExtractorTest extends FunSuite with BeforeAndAfter {
  private val warcPath = Resources.getResource("warc/example.docs.warc.gz").getPath
  private var sc: SparkContext = _

  before {
    // allowMultipleContexts avoids clashes with other suites in the same JVM.
    val conf = new SparkConf()
      .setMaster("local[4]")
      .setAppName("example-spark")
      .set("spark.driver.allowMultipleContexts", "true")
    sc = new SparkContext(conf)
  }

  test("Word processor information extractor DF") {
    val wordDocs = RecordLoader.loadArchives(warcPath, sc).wordProcessorFiles()
    val results = WordProcessorInformationExtractor(wordDocs).collect()

    // The example WARC contains 3 word-processor records.
    assert(results.length == 3)

    // Spot-check every column of the first row: url, filename, extension,
    // mime_type_web_server, mime_type_tika, md5, sha1.
    val first = results(0)
    assert(first.get(0) == "https://ruebot.net/files/aut-test-fixtures/test-aut-fixtures.rtf")
    assert(first.get(1) == "test-aut-fixtures.rtf")
    assert(first.get(2) == "rtf")
    assert(first.get(3) == "application/rtf")
    assert(first.get(4) == "application/rtf")
    assert(first.get(5) == "e483512b65ba44d71e843c57de2adeb7")
    assert(first.get(6) == "8cf3066421f0a07fcd6e7a3e86ebd447edf7cfcb")
  }

  after {
    // Stop the context if the fixture managed to create one.
    Option(sc).foreach(_.stop())
  }
}