From 5ef2920342865af432652888d2606d291ad49fbf Mon Sep 17 00:00:00 2001 From: nruest Date: Tue, 21 Apr 2020 18:56:17 -0400 Subject: [PATCH 1/5] Update PlainTextExtractor to output a single column; text. - Resolves #452 - PlainTextExtractor runs RemoveHTML, and ExtractBoilerplate on `content` - Update test --- .../io/archivesunleashed/app/PlainTextExtractor.scala | 5 ++--- .../app/PlainTextExtractorTest.scala | 11 ++++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/main/scala/io/archivesunleashed/app/PlainTextExtractor.scala b/src/main/scala/io/archivesunleashed/app/PlainTextExtractor.scala index d110a172..896b0876 100644 --- a/src/main/scala/io/archivesunleashed/app/PlainTextExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/PlainTextExtractor.scala @@ -17,7 +17,7 @@ package io.archivesunleashed.app import io.archivesunleashed.ArchiveRecord -import io.archivesunleashed.df.{ExtractDomainDF, RemoveHTMLDF, +import io.archivesunleashed.df.{ExtractBoilerpipeTextDF, RemoveHTMLDF, RemoveHTTPHeaderDF} import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} @@ -32,7 +32,6 @@ object PlainTextExtractor { // scalastyle:off import spark.implicits._ // scalastyle:on - d.select($"crawl_date", ExtractDomainDF($"url").as("domain"), - $"url", RemoveHTMLDF(RemoveHTTPHeaderDF($"content")).as("text")) + d.select(ExtractBoilerpipeTextDF(RemoveHTMLDF($"content")).as("content")) } } diff --git a/src/test/scala/io/archivesunleashed/app/PlainTextExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/PlainTextExtractorTest.scala index db134f15..f61e4295 100644 --- a/src/test/scala/io/archivesunleashed/app/PlainTextExtractorTest.scala +++ b/src/test/scala/io/archivesunleashed/app/PlainTextExtractorTest.scala @@ -38,16 +38,17 @@ class PlainTextExtractorTest extends FunSuite with BeforeAndAfter { sc = new SparkContext(conf) } - test("Plain text extractor RDD & DF") { + test("Plain text extractor") { val df = 
RecordLoader.loadArchives(arcPath, sc).webpages() val dfResults = PlainTextExtractor(df).collect() val RESULTSLENGTH = 94 assert(dfResults.length == RESULTSLENGTH) - assert(dfResults(0).get(0) == "20080430") - assert(dfResults(0).get(1) == "www.archive.org") - assert(dfResults(0).get(2) == "http://www.archive.org/") - assert(dfResults(0).get(3) == "Please visit our website at: http://www.archive.org") + assert(dfResults(0).get(0) == "") + assert(dfResults(34).get(0) + .toString + .startsWith("Internet Archive Frequently Asked Questions Web | ")) + assert(dfResults(50).get(0) == "") } after { From 04bd881958f0a404a27ab1a51b61bcc88c2d8c9d Mon Sep 17 00:00:00 2001 From: nruest Date: Tue, 21 Apr 2020 21:35:33 -0400 Subject: [PATCH 2/5] Add option to save to Parquet for app. - Resolves #448 - Update test - Add CSV headers to coalesced CSV output --- .../app/CommandLineApp.scala | 106 +++++++++++++++--- .../app/CommandLineAppTest.scala | 20 +++- 2 files changed, 107 insertions(+), 19 deletions(-) diff --git a/src/main/scala/io/archivesunleashed/app/CommandLineApp.scala b/src/main/scala/io/archivesunleashed/app/CommandLineApp.scala index 18f96866..267fe801 100644 --- a/src/main/scala/io/archivesunleashed/app/CommandLineApp.scala +++ b/src/main/scala/io/archivesunleashed/app/CommandLineApp.scala @@ -117,7 +117,11 @@ class CommandLineApp(conf: CmdAppConf) { inputFiles.tail foreach { f => df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).audio()) } - save(AudioInformationExtractor(df)) + if (!configuration.outputFormat.isEmpty && configuration.outputFormat() == "parquet") { + saveParquet(AudioInformationExtractor(df)) + } else { + saveCsv(AudioInformationExtractor(df)) + } }), "DomainFrequencyExtractor" -> ((inputFiles: List[String]) => { @@ -125,7 +129,11 @@ class CommandLineApp(conf: CmdAppConf) { inputFiles.tail foreach { f => df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).webpages()) } - save(DomainFrequencyExtractor(df)) + if 
(!configuration.outputFormat.isEmpty && configuration.outputFormat() == "parquet") { + saveParquet(DomainFrequencyExtractor(df)) + } else { + saveCsv(DomainFrequencyExtractor(df)) + } }), "DomainGraphExtractor" -> ((inputFiles: List[String]) => { @@ -136,11 +144,13 @@ class CommandLineApp(conf: CmdAppConf) { if (!configuration.outputFormat.isEmpty && configuration.outputFormat() == "gexf") { new File(saveTarget).mkdirs() WriteGEXF(DomainGraphExtractor(df).collect(), Paths.get(saveTarget).toString + "/GEXF.gexf") + } else if (!configuration.outputFormat.isEmpty && configuration.outputFormat() == "parquet") { + saveParquet(DomainGraphExtractor(df)) } else if (!configuration.outputFormat.isEmpty && configuration.outputFormat() == "graphml") { new File(saveTarget).mkdirs() WriteGraphML(DomainGraphExtractor(df).collect(), Paths.get(saveTarget).toString + "/GRAPHML.graphml") } else { - save(DomainGraphExtractor(df)) + saveCsv(DomainGraphExtractor(df)) } }), "ImageInformationExtractor" -> @@ -149,7 +159,11 @@ class CommandLineApp(conf: CmdAppConf) { inputFiles.tail foreach { f => df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).images()) } - save(ImageInformationExtractor(df)) + if (!configuration.outputFormat.isEmpty && configuration.outputFormat() == "parquet") { + saveParquet(ImageInformationExtractor(df)) + } else { + saveCsv(ImageInformationExtractor(df)) + } }), "ImageGraphExtractor" -> ((inputFiles: List[String]) => { @@ -157,7 +171,11 @@ class CommandLineApp(conf: CmdAppConf) { inputFiles.tail foreach { f => df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).imagegraph()) } - save(ImageGraphExtractor(df)) + if (!configuration.outputFormat.isEmpty && configuration.outputFormat() == "parquet") { + saveParquet(ImageGraphExtractor(df)) + } else { + saveCsv(ImageGraphExtractor(df)) + } }), "PDFInformationExtractor" -> ((inputFiles: List[String]) => { @@ -165,7 +183,11 @@ class CommandLineApp(conf: CmdAppConf) { inputFiles.tail foreach { f => df = 
df.union(RecordLoader.loadArchives(f, sparkCtx.get).pdfs()) } - save(PDFInformationExtractor(df)) + if (!configuration.outputFormat.isEmpty && configuration.outputFormat() == "parquet") { + saveParquet(PDFInformationExtractor(df)) + } else { + saveCsv(PDFInformationExtractor(df)) + } }), "PlainTextExtractor" -> ((inputFiles: List[String]) => { @@ -173,7 +195,11 @@ class CommandLineApp(conf: CmdAppConf) { inputFiles.tail foreach { f => df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).webpages()) } - save(PlainTextExtractor(df)) + if (!configuration.outputFormat.isEmpty && configuration.outputFormat() == "parquet") { + saveParquet(PlainTextExtractor(df)) + } else { + saveCsv(PlainTextExtractor(df)) + } }), "PresentationProgramInformationExtractor" -> ((inputFiles: List[String]) => { @@ -181,7 +207,11 @@ class CommandLineApp(conf: CmdAppConf) { inputFiles.tail foreach { f => df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).presentationProgramFiles()) } - save(PresentationProgramInformationExtractor(df)) + if (!configuration.outputFormat.isEmpty && configuration.outputFormat() == "parquet") { + saveParquet(PresentationProgramInformationExtractor(df)) + } else { + saveCsv(PresentationProgramInformationExtractor(df)) + } }), "SpreadsheetInformationExtractor" -> ((inputFiles: List[String]) => { @@ -189,7 +219,11 @@ class CommandLineApp(conf: CmdAppConf) { inputFiles.tail foreach { f => df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).spreadsheets()) } - save(SpreadsheetInformationExtractor(df)) + if (!configuration.outputFormat.isEmpty && configuration.outputFormat() == "parquet") { + saveParquet(SpreadsheetInformationExtractor(df)) + } else { + saveCsv(SpreadsheetInformationExtractor(df)) + } }), "TextFilesInformationExtractor" -> ((inputFiles: List[String]) => { @@ -197,7 +231,11 @@ class CommandLineApp(conf: CmdAppConf) { inputFiles.tail foreach { f => df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).textFiles()) } - 
save(TextFilesInformationExtractor(df)) + if (!configuration.outputFormat.isEmpty && configuration.outputFormat() == "parquet") { + saveParquet(TextFilesInformationExtractor(df)) + } else { + saveCsv(TextFilesInformationExtractor(df)) + } }), "VideoInformationExtractor" -> ((inputFiles: List[String]) => { @@ -205,7 +243,11 @@ class CommandLineApp(conf: CmdAppConf) { inputFiles.tail foreach { f => df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).videos()) } - save(VideoInformationExtractor(df)) + if (!configuration.outputFormat.isEmpty && configuration.outputFormat() == "parquet") { + saveParquet(VideoInformationExtractor(df)) + } else { + saveCsv(VideoInformationExtractor(df)) + } }), "WebGraphExtractor" -> ((inputFiles: List[String]) => { @@ -213,7 +255,11 @@ class CommandLineApp(conf: CmdAppConf) { inputFiles.tail foreach { f => df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).webgraph()) } - save(WebGraphExtractor(df)) + if (!configuration.outputFormat.isEmpty && configuration.outputFormat() == "parquet") { + saveParquet(WebGraphExtractor(df)) + } else { + saveCsv(WebGraphExtractor(df)) + } }), "WebPagesExtractor" -> ((inputFiles: List[String]) => { @@ -221,7 +267,11 @@ class CommandLineApp(conf: CmdAppConf) { inputFiles.tail foreach { f => df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).webpages()) } - save(WebPagesExtractor(df)) + if (!configuration.outputFormat.isEmpty && configuration.outputFormat() == "parquet") { + saveParquet(WebPagesExtractor(df)) + } else { + saveCsv(WebPagesExtractor(df)) + } }), "WordProcessorInformationExtractor" -> ((inputFiles: List[String]) => { @@ -229,21 +279,26 @@ class CommandLineApp(conf: CmdAppConf) { inputFiles.tail foreach { f => df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).wordProcessorFiles()) } - save(WordProcessorInformationExtractor(df)) + if (!configuration.outputFormat.isEmpty && configuration.outputFormat() == "parquet") { + saveParquet(WordProcessorInformationExtractor(df)) + } 
else { + saveCsv(WordProcessorInformationExtractor(df)) + } }) ) - /** Generic routine for saving Dataset obtained from querying DataFrames to file. + /** Routine for saving Dataset obtained from querying DataFrames to CSV. * Files may be merged according to options specified in 'partition' setting. * * @param d generic dataset obtained from querying DataFrame * @return Unit */ - def save(d: Dataset[Row]): Unit = { + def saveCsv(d: Dataset[Row]): Unit = { if (!configuration.partition.isEmpty) { d.coalesce(configuration.partition()).write .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .option("header", "true") .csv(saveTarget) } else { d.write @@ -252,6 +307,25 @@ class CommandLineApp(conf: CmdAppConf) { } } + /** Routine for saving Dataset obtained from querying DataFrames to Parquet. + * Files may be merged according to options specified in 'partition' setting. + * + * @param d generic dataset obtained from querying DataFrame + * @return Unit + */ + + def saveParquet(d: Dataset[Row]): Unit = { + if (!configuration.partition.isEmpty) { + d.coalesce(configuration.partition()).write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .parquet(saveTarget) + } else { + d.write + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .parquet(saveTarget) + } + } + /** Verify the validity of command line arguments regarding input and output files. * * All input files need to exist, and ouput files should not exist, for this to pass. 
diff --git a/src/test/scala/io/archivesunleashed/app/CommandLineAppTest.scala b/src/test/scala/io/archivesunleashed/app/CommandLineAppTest.scala index 9ebd02d9..3968188f 100644 --- a/src/test/scala/io/archivesunleashed/app/CommandLineAppTest.scala +++ b/src/test/scala/io/archivesunleashed/app/CommandLineAppTest.scala @@ -45,34 +45,48 @@ class CommandLineAppTest extends FunSuite with BeforeAndAfter { private val testSuccessCmds = Array( Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "DomainFrequencyExtractor"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, domainGraphOpt), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, domainGraphOpt, "--output-format", "GEXF"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, domainGraphOpt, "--output-format", "GRAPHML"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, domainGraphOpt, "--output-format", "parquet"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, domainGraphOpt, "--output-format", "gexf"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, domainGraphOpt, "--output-format", "graphml"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, domainGraphOpt, "--output-format", "parquet"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, plainTextOpt), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, plainTextOpt, "--split"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, plainTextOpt, "--partition", "1"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, plainTextOpt, "--output-format", "parquet"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, "--partition", "1", extractOpt, plainTextOpt), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, imageGraphOpt), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, imageGraphOpt, 
"--partition", "1"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, imageGraphOpt, "--output-format", "parquet"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, webPagesOpt), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, webPagesOpt, "--partition", "1"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, webPagesOpt, "--output-format", "parquet"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "AudioInformationExtractor"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "AudioInformationExtractor", "--partition", "1"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "AudioInformationExtractor", "--output-format", "parquet"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "ImageInformationExtractor"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "ImageInformationExtractor", "--partition", "1"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "ImageInformationExtractor", "--output-format", "parquet"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PDFInformationExtractor"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PDFInformationExtractor", "--partition", "1"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PDFInformationExtractor", "--output-format", "parquet"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PresentationProgramInformationExtractor"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PresentationProgramInformationExtractor", "--partition", "1"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PresentationProgramInformationExtractor", "--output-format", "parquet"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "SpreadsheetInformationExtractor"), Array(inputOpt, 
arcPath, warcPath, outputOpt, outputDir, extractOpt, "SpreadsheetInformationExtractor", "--partition", "1"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "SpreadsheetInformationExtractor", "--output-format", "parquet"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "TextFilesInformationExtractor"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "TextFilesInformationExtractor", "--partition", "1"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "TextFilesInformationExtractor", "--output-format", "parquet"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "VideoInformationExtractor"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "VideoInformationExtractor", "--partition", "1"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "VideoInformationExtractor", "--output-format", "parquet"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WordProcessorInformationExtractor"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WordProcessorInformationExtractor", "--partition", "1"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WordProcessorInformationExtractor", "--output-format", "parquet"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WebGraphExtractor"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WebGraphExtractor", "--partition", "1") + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WebGraphExtractor", "--partition", "1"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WebGraphExtractor", "--output-format", "parquet") ) private val testFailCmds = Array( From fd5fc249d003d43cbeefda7e86cca673ee532312 Mon Sep 17 00:00:00 2001 From: nruest Date: Wed, 22 Apr 2020 10:11:50 -0400 Subject: [PATCH 3/5] CommandLineAppTest updates --- 
.../app/CommandLineAppTest.scala | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/src/test/scala/io/archivesunleashed/app/CommandLineAppTest.scala b/src/test/scala/io/archivesunleashed/app/CommandLineAppTest.scala index 3968188f..45547bd1 100644 --- a/src/test/scala/io/archivesunleashed/app/CommandLineAppTest.scala +++ b/src/test/scala/io/archivesunleashed/app/CommandLineAppTest.scala @@ -44,6 +44,8 @@ class CommandLineAppTest extends FunSuite with BeforeAndAfter { private var sc: SparkContext = _ private val testSuccessCmds = Array( Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "DomainFrequencyExtractor"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "DomainFrequencyExtractor", "--output-format", "parquet"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "DomainFrequencyExtractor", "--output-format", "parquet", "--partition", "1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, domainGraphOpt), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, domainGraphOpt, "--output-format", "parquet"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, domainGraphOpt, "--output-format", "gexf"), @@ -53,40 +55,52 @@ class CommandLineAppTest extends FunSuite with BeforeAndAfter { Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, plainTextOpt, "--split"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, plainTextOpt, "--partition", "1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, plainTextOpt, "--output-format", "parquet"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, plainTextOpt, "--output-format", "parquet", "--partition", "1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, "--partition", "1", extractOpt, plainTextOpt), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, imageGraphOpt), 
Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, imageGraphOpt, "--partition", "1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, imageGraphOpt, "--output-format", "parquet"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, imageGraphOpt, "--output-format", "parquet", "--partition", "1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, webPagesOpt), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, webPagesOpt, "--partition", "1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, webPagesOpt, "--output-format", "parquet"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, webPagesOpt, "--output-format", "parquet", "--partition", "1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "AudioInformationExtractor"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "AudioInformationExtractor", "--partition", "1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "AudioInformationExtractor", "--output-format", "parquet"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "AudioInformationExtractor", "--output-format", "parquet", "--partition", "1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "ImageInformationExtractor"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "ImageInformationExtractor", "--partition", "1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "ImageInformationExtractor", "--output-format", "parquet"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "ImageInformationExtractor", "--output-format", "parquet", "--partition", "1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PDFInformationExtractor"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PDFInformationExtractor", "--partition", 
"1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PDFInformationExtractor", "--output-format", "parquet"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PDFInformationExtractor", "--output-format", "parquet", "--partition", "1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PresentationProgramInformationExtractor"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PresentationProgramInformationExtractor", "--partition", "1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PresentationProgramInformationExtractor", "--output-format", "parquet"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PresentationProgramInformationExtractor", "--output-format", "parquet", "--partition", "1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "SpreadsheetInformationExtractor"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "SpreadsheetInformationExtractor", "--partition", "1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "SpreadsheetInformationExtractor", "--output-format", "parquet"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "SpreadsheetInformationExtractor", "--output-format", "parquet", "--partition", "1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "TextFilesInformationExtractor"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "TextFilesInformationExtractor", "--partition", "1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "TextFilesInformationExtractor", "--output-format", "parquet"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "TextFilesInformationExtractor", "--output-format", "parquet", "--partition", "1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "VideoInformationExtractor"), Array(inputOpt, arcPath, 
warcPath, outputOpt, outputDir, extractOpt, "VideoInformationExtractor", "--partition", "1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "VideoInformationExtractor", "--output-format", "parquet"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "VideoInformationExtractor", "--output-format", "parquet", "--partition", "1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WordProcessorInformationExtractor"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WordProcessorInformationExtractor", "--partition", "1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WordProcessorInformationExtractor", "--output-format", "parquet"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WordProcessorInformationExtractor", "--output-format", "parquet", "--partition", "1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WebGraphExtractor"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WebGraphExtractor", "--partition", "1"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WebGraphExtractor", "--output-format", "parquet") + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WebGraphExtractor", "--output-format", "parquet"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WebGraphExtractor", "--output-format", "parquet", "--partition", "1") ) private val testFailCmds = Array( From 3ddbc9994bd249877f926854d2a8e3666fbefc9a Mon Sep 17 00:00:00 2001 From: nruest Date: Wed, 22 Apr 2020 10:37:25 -0400 Subject: [PATCH 4/5] CommandLineAppTest updates --- .../archivesunleashed/app/CommandLineAppTest.scala | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/test/scala/io/archivesunleashed/app/CommandLineAppTest.scala b/src/test/scala/io/archivesunleashed/app/CommandLineAppTest.scala index 45547bd1..3db57f39 100644 --- 
a/src/test/scala/io/archivesunleashed/app/CommandLineAppTest.scala +++ b/src/test/scala/io/archivesunleashed/app/CommandLineAppTest.scala @@ -44,6 +44,7 @@ class CommandLineAppTest extends FunSuite with BeforeAndAfter { private var sc: SparkContext = _ private val testSuccessCmds = Array( Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "DomainFrequencyExtractor"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "DomainFrequencyExtractor", "--split"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "DomainFrequencyExtractor", "--output-format", "parquet"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "DomainFrequencyExtractor", "--output-format", "parquet", "--partition", "1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, domainGraphOpt), @@ -58,46 +59,57 @@ class CommandLineAppTest extends FunSuite with BeforeAndAfter { Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, plainTextOpt, "--output-format", "parquet", "--partition", "1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, "--partition", "1", extractOpt, plainTextOpt), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, imageGraphOpt), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, imageGraphOpt, "--split"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, imageGraphOpt, "--partition", "1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, imageGraphOpt, "--output-format", "parquet"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, imageGraphOpt, "--output-format", "parquet", "--partition", "1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, webPagesOpt), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, webPagesOpt, "--split"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, webPagesOpt, "--partition", 
"1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, webPagesOpt, "--output-format", "parquet"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, webPagesOpt, "--output-format", "parquet", "--partition", "1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "AudioInformationExtractor"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "AudioInformationExtractor", "--split"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "AudioInformationExtractor", "--partition", "1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "AudioInformationExtractor", "--output-format", "parquet"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "AudioInformationExtractor", "--output-format", "parquet", "--partition", "1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "ImageInformationExtractor"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "ImageInformationExtractor", "--split"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "ImageInformationExtractor", "--partition", "1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "ImageInformationExtractor", "--output-format", "parquet"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "ImageInformationExtractor", "--output-format", "parquet", "--partition", "1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PDFInformationExtractor"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PDFInformationExtractor", "--split"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PDFInformationExtractor", "--partition", "1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PDFInformationExtractor", "--output-format", "parquet"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, 
"PDFInformationExtractor", "--output-format", "parquet", "--partition", "1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PresentationProgramInformationExtractor"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PresentationProgramInformationExtractor", "--split"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PresentationProgramInformationExtractor", "--partition", "1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PresentationProgramInformationExtractor", "--output-format", "parquet"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PresentationProgramInformationExtractor", "--output-format", "parquet", "--partition", "1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "SpreadsheetInformationExtractor"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "SpreadsheetInformationExtractor", "--split"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "SpreadsheetInformationExtractor", "--partition", "1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "SpreadsheetInformationExtractor", "--output-format", "parquet"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "SpreadsheetInformationExtractor", "--output-format", "parquet", "--partition", "1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "TextFilesInformationExtractor"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "TextFilesInformationExtractor", "--split"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "TextFilesInformationExtractor", "--partition", "1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "TextFilesInformationExtractor", "--output-format", "parquet"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "TextFilesInformationExtractor", "--output-format", 
"parquet", "--partition", "1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "VideoInformationExtractor"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "VideoInformationExtractor", "--split"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "VideoInformationExtractor", "--partition", "1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "VideoInformationExtractor", "--output-format", "parquet"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "VideoInformationExtractor", "--output-format", "parquet", "--partition", "1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WordProcessorInformationExtractor"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WordProcessorInformationExtractor", "--split"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WordProcessorInformationExtractor", "--partition", "1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WordProcessorInformationExtractor", "--output-format", "parquet"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WordProcessorInformationExtractor", "--output-format", "parquet", "--partition", "1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WebGraphExtractor"), + Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WebGraphExtractor", "--split"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WebGraphExtractor", "--partition", "1"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WebGraphExtractor", "--output-format", "parquet"), Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WebGraphExtractor", "--output-format", "parquet", "--partition", "1") From 1f89295a3e19c02e1f1baa50cf0bfaf462fc8777 Mon Sep 17 00:00:00 2001 From: nruest Date: Wed, 22 Apr 2020 12:55:32 -0400 Subject: [PATCH 5/5] 
README --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b6e7b844..94c4f6ce 100644 --- a/README.md +++ b/README.md @@ -77,8 +77,8 @@ spark-submit --class io.archivesunleashed.app.CommandLinAppRunner PATH_TO_AUT_JA Additional flags include: -* `--output-format FORMAT` (Used only for the `DomainGraphExtractor`, and the - options are `TEXT` (default) or `GEXF`.) +* `--output-format FORMAT` (`csv` (default) or `parquet`. `DomainGraphExtractor` + also supports two additional output formats: `graphml` and `gexf`.) * `--split` (The extractor will put results for each input file in its own directory. Each directory name will be the name of the ARC/WARC file parsed.) * `--partition N` (The extractor will partition RDD or DataFrame according to N