From 87c9734d89061905849020ee660d9d5978394e3e Mon Sep 17 00:00:00 2001 From: Nick Ruest Date: Mon, 10 Feb 2020 10:08:03 -0500 Subject: [PATCH] Rename imageLinks to imagegraph; resolves #419 (#421) * Rename imageLinks to imagegraph; resolves #419 --- src/main/python/aut/common.py | 2 +- src/main/scala/io/archivesunleashed/DataFrameLoader.scala | 4 ++-- src/main/scala/io/archivesunleashed/package.scala | 2 +- .../scala/io/archivesunleashed/df/DataFrameLoaderTest.scala | 4 ++-- .../scala/io/archivesunleashed/df/ExtractImageLinksTest.scala | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/main/python/aut/common.py b/src/main/python/aut/common.py index 73d158ad..ded1ae03 100644 --- a/src/main/python/aut/common.py +++ b/src/main/python/aut/common.py @@ -21,7 +21,7 @@ def images(self): return DataFrame(self.loader.images(self.path), self.sqlContext) def image_links(self): - return DataFrame(self.loader.imageLinks(self.path), self.sqlContext) + return DataFrame(self.loader.imagegraph(self.path), self.sqlContext) def pdfs(self): return DataFrame(self.loader.pdfs(self.path), self.sqlContext) diff --git a/src/main/scala/io/archivesunleashed/DataFrameLoader.scala b/src/main/scala/io/archivesunleashed/DataFrameLoader.scala index a608a763..94d6f180 100644 --- a/src/main/scala/io/archivesunleashed/DataFrameLoader.scala +++ b/src/main/scala/io/archivesunleashed/DataFrameLoader.scala @@ -41,9 +41,9 @@ class DataFrameLoader(sc: SparkContext) { } /* Create a DataFrame with source page, and image url. */ - def imageLinks(path: String): DataFrame = { + def imagegraph(path: String): DataFrame = { RecordLoader.loadArchives(path, sc) - .imageLinks() + .imagegraph() } /** Create a DataFrame with image url, filename, extension, mime_type_web_servr, mime_type_tika, width, height, md5, and raw bytes. */ diff --git a/src/main/scala/io/archivesunleashed/package.scala b/src/main/scala/io/archivesunleashed/package.scala index fff79edd..b5c69595 100644 --- a/src/main/scala/io/archivesunleashed/package.scala +++ b/src/main/scala/io/archivesunleashed/package.scala @@ -378,7 +378,7 @@ package object archivesunleashed { } /* Extracts all the images links from a source page. */ - def imageLinks(): DataFrame = { + def imagegraph(): DataFrame = { val records = rdd .keepValidPages() .flatMap(r => ({ diff --git a/src/test/scala/io/archivesunleashed/df/DataFrameLoaderTest.scala b/src/test/scala/io/archivesunleashed/df/DataFrameLoaderTest.scala index 6b6e14cc..5b629205 100644 --- a/src/test/scala/io/archivesunleashed/df/DataFrameLoaderTest.scala +++ b/src/test/scala/io/archivesunleashed/df/DataFrameLoaderTest.scala @@ -48,7 +48,7 @@ class DataFrameLoaderTest extends FunSuite with BeforeAndAfter { val df = new DataFrameLoader(sc) val validPages = df.webpages(arcPath) val hyperlinks = df.webgraph(arcPath) - val imageLinks = df.imageLinks(arcPath) + val imagegraph = df.imagegraph(arcPath) val images = df.images(arcPath) val pdfs = df.pdfs(pdfPath) val audio = df.audio(mediaPath) @@ -67,7 +67,7 @@ class DataFrameLoaderTest extends FunSuite with BeforeAndAfter { assert(r_2(0) == "http://web.archive.org/collections/web/advanced.html") assert(r_2(1) == "Advanced Search") - val r_3 = imageLinks.take(100)(99) + val r_3 = imagegraph.take(100)(99) assert(r_3.get(0) == "20080430") assert(r_3.get(1) == "http://www.archive.org/details/secretarmiesb00spivrich") assert(r_3.get(2) == "http://www.archive.org/images/star.png") diff --git a/src/test/scala/io/archivesunleashed/df/ExtractImageLinksTest.scala b/src/test/scala/io/archivesunleashed/df/ExtractImageLinksTest.scala index f7e9453a..4fa200b2 100644 --- a/src/test/scala/io/archivesunleashed/df/ExtractImageLinksTest.scala +++ b/src/test/scala/io/archivesunleashed/df/ExtractImageLinksTest.scala @@ -40,7 +40,7 @@ class ImageLinksTest extends FunSuite with BeforeAndAfter { test("Image links extraction DF") { val df = RecordLoader.loadArchives(arcPath, sc) - .imageLinks() + .imagegraph() // We need this in order to use the $-notation val spark = SparkSession.builder().master("local").getOrCreate()