diff --git a/src/main/scala/io/archivesunleashed/package.scala b/src/main/scala/io/archivesunleashed/package.scala index 2a24e946..4c239fcb 100644 --- a/src/main/scala/io/archivesunleashed/package.scala +++ b/src/main/scala/io/archivesunleashed/package.scala @@ -165,18 +165,20 @@ package object archivesunleashed { /* Creates a column for Bytes as well in Dataframe. Call KeepImages OR KeepValidPages on RDD depending upon the requirement before calling this method */ def all(): DataFrame = { - val records = rdd.map(r => - Row( - r.getCrawlDate, - r.getUrl, - r.getMimeType, - DetectMimeTypeTika(r.getBinaryBytes), - r.getContentString, - r.getBinaryBytes, - r.getHttpStatus, - r.getArchiveFilename + val records = rdd + .removeFiledesc() + .map(r => + Row( + r.getCrawlDate, + r.getUrl, + r.getMimeType, + DetectMimeTypeTika(r.getBinaryBytes), + r.getContentString, + r.getBinaryBytes, + r.getHttpStatus, + r.getArchiveFilename + ) ) - ) val schema = new StructType() .add(StructField("crawl_date", StringType, true)) @@ -192,6 +194,14 @@ package object archivesunleashed { sqlContext.getOrCreate().createDataFrame(records, schema) } + /** Filters out filedesc:// and dns: records (case-insensitive, locale-independent match). */ + def removeFiledesc(): RDD[ArchiveRecord] = { + rdd.filter { r => + val url = r.getUrl.toLowerCase(java.util.Locale.ROOT) + !url.startsWith("filedesc:") && !url.startsWith("dns:") + } + } + /** Removes all non-html-based data (images, executables, etc.) from html text. */ def keepValidPages(): RDD[ArchiveRecord] = { rdd.filter(r => @@ -208,6 +218,7 @@ package object archivesunleashed { /** Extracts webpages with columns for crawl data, url, MIME type, and content. */ def webpages(): DataFrame = { val records = rdd + .removeFiledesc() .keepValidPages() .map(r => Row( @@ -235,6 +246,7 @@ package object archivesunleashed { /** Extracts a webgraph with columns for crawl date, source url, destination url, and anchor text. 
*/ def webgraph(): DataFrame = { val records = rdd + .removeFiledesc() .keepValidPages() .flatMap(r => ExtractLinks(r.getUrl, r.getContentString) @@ -256,6 +268,7 @@ package object archivesunleashed { /* Extracts all the images links from a source page. */ def imagegraph(): DataFrame = { val records = rdd + .removeFiledesc() .keepValidPages() .flatMap(r => ExtractImageLinks(r.getUrl, r.getContentString) diff --git a/src/test/scala/io/archivesunleashed/RecordDFTest.scala b/src/test/scala/io/archivesunleashed/RecordDFTest.scala index 978bbcfb..bf8a1b87 100644 --- a/src/test/scala/io/archivesunleashed/RecordDFTest.scala +++ b/src/test/scala/io/archivesunleashed/RecordDFTest.scala @@ -72,7 +72,7 @@ class RecordDFTest extends FunSuite with BeforeAndAfter { import spark.implicits._ // scalastyle:on - val expected = "000" + val expected = "200" val base = RecordLoader .loadArchives(arcPath, sc) .all() diff --git a/src/test/scala/io/archivesunleashed/df/DataFrameLoaderTest.scala b/src/test/scala/io/archivesunleashed/df/DataFrameLoaderTest.scala index fcbfca58..bb1d383f 100644 --- a/src/test/scala/io/archivesunleashed/df/DataFrameLoaderTest.scala +++ b/src/test/scala/io/archivesunleashed/df/DataFrameLoaderTest.scala @@ -124,7 +124,7 @@ class DataFrameLoaderTest extends FunSuite with BeforeAndAfter { val r_11 = all.select(url, mime_type).take(1)(0) assert( - r_11.getAs[String](url) == "filedesc://IAH-20080430204825-00000-blackbook.arc" + r_11.getAs[String](url) == "http://www.archive.org/robots.txt" ) assert(r_11.getAs[String](mime_type) == "text/plain") }