Skip to content

Commit

Permalink
Add binary extraction DataFrames to PySpark.
Browse files Browse the repository at this point in the history
- Address #190
- Address #259
- Address #302
- Address #303
- Address #304
- Address #305
- Address #306
- Address #307
  • Loading branch information
ruebot committed Aug 20, 2019
1 parent 448601e commit 1176fd5
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 2 deletions.
27 changes: 27 additions & 0 deletions src/main/python/aut/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,30 @@ def pages(self):

def links(self):
    """Return a DataFrame constructed from ``self.loader.extractHyperlinks(self.path)``."""
    extracted = self.loader.extractHyperlinks(self.path)
    return DataFrame(extracted, self.sqlContext)

def images(self):
    """Return a DataFrame constructed from ``self.loader.extractImages(self.path)``."""
    extracted = self.loader.extractImages(self.path)
    return DataFrame(extracted, self.sqlContext)

def image_links(self):
    """Return a DataFrame constructed from ``self.loader.extractImageLinks(self.path)``."""
    extracted = self.loader.extractImageLinks(self.path)
    return DataFrame(extracted, self.sqlContext)

def pdfs(self):
    """Return a DataFrame constructed from ``self.loader.extractPDFs(self.path)``."""
    extracted = self.loader.extractPDFs(self.path)
    return DataFrame(extracted, self.sqlContext)

def audio(self):
    """Return a DataFrame constructed from ``self.loader.extractAudio(self.path)``."""
    extracted = self.loader.extractAudio(self.path)
    return DataFrame(extracted, self.sqlContext)

def video(self):
    """Return a DataFrame constructed from ``self.loader.extractVideo(self.path)``."""
    extracted = self.loader.extractVideo(self.path)
    return DataFrame(extracted, self.sqlContext)

def spreadsheets(self):
    """Return a DataFrame constructed from ``self.loader.extractSpreadsheets(self.path)``."""
    extracted = self.loader.extractSpreadsheets(self.path)
    return DataFrame(extracted, self.sqlContext)

def presentation_program(self):
    """Return a DataFrame constructed from ``self.loader.extractPresentationProgram(self.path)``."""
    extracted = self.loader.extractPresentationProgram(self.path)
    return DataFrame(extracted, self.sqlContext)

def word_processor(self):
    """Return a DataFrame constructed from ``self.loader.extractWordProcessor(self.path)``."""
    extracted = self.loader.extractWordProcessor(self.path)
    return DataFrame(extracted, self.sqlContext)

def text_files(self):
    """Return a DataFrame constructed from ``self.loader.extractTextFiles(self.path)``."""
    extracted = self.loader.extractTextFiles(self.path)
    return DataFrame(extracted, self.sqlContext)
48 changes: 46 additions & 2 deletions src/main/scala/io/archivesunleashed/DataFrameLoader.scala
Original file line number Diff line number Diff line change
Expand Up @@ -21,25 +21,69 @@ import org.apache.spark.SparkContext
import org.apache.spark.sql.DataFrame

class DataFrameLoader(sc: SparkContext) {

  /** Load the archive records found at `path` using this loader's SparkContext. */
  private def archives(path: String) = RecordLoader.loadArchives(path, sc)

  /** Create a DataFrame with crawl_date, url, mime_type_web_server, and content. */
  def extractValidPages(path: String): DataFrame =
    archives(path).extractValidPagesDF()

  /** Create a DataFrame with crawl_date, source, destination, and anchor. */
  def extractHyperlinks(path: String): DataFrame =
    archives(path).extractHyperlinksDF()

  /** Create a DataFrame with source page, and image url. */
  def extractImageLinks(path: String): DataFrame =
    archives(path).extractImageLinksDF()

  /** Create a DataFrame with image url, filename, extension, mime_type_web_server,
    * mime_type_tika, width, height, md5, and raw bytes.
    */
  def extractImages(path: String): DataFrame =
    archives(path).extractImageDetailsDF()

  // NOTE(review): the *DetailsDF calls below are written without parentheses, unlike
  // the calls above; they are kept as-is -- confirm against the method definitions.

  /** Create a DataFrame with PDF url, filename, extension, mime_type_web_server,
    * mime_type_tika, md5, and raw bytes.
    */
  def extractPDFs(path: String): DataFrame =
    archives(path).extractPDFDetailsDF

  /** Create a DataFrame with audio url, filename, extension, mime_type_web_server,
    * mime_type_tika, md5, and raw bytes.
    */
  def extractAudio(path: String): DataFrame =
    archives(path).extractAudioDetailsDF

  /** Create a DataFrame with video url, filename, extension, mime_type_web_server,
    * mime_type_tika, md5, and raw bytes.
    */
  def extractVideo(path: String): DataFrame =
    archives(path).extractVideoDetailsDF

  /** Create a DataFrame with spreadsheet url, filename, extension, mime_type_web_server,
    * mime_type_tika, md5, and raw bytes.
    */
  def extractSpreadsheets(path: String): DataFrame =
    archives(path).extractSpreadsheetDetailsDF

  /** Create a DataFrame with presentation program url, filename, extension,
    * mime_type_web_server, mime_type_tika, md5, and raw bytes.
    */
  def extractPresentationProgram(path: String): DataFrame =
    archives(path).extractPresentationProgramDetailsDF

  /** Create a DataFrame with word processor url, filename, extension,
    * mime_type_web_server, mime_type_tika, md5, and raw bytes.
    */
  def extractWordProcessor(path: String): DataFrame =
    archives(path).extractWordProcessorDetailsDF

  /** Create a DataFrame with text file url, filename, extension, mime_type_web_server,
    * mime_type_tika, md5, and raw bytes.
    */
  def extractTextFiles(path: String): DataFrame =
    archives(path).extractTextFilesDetailsDF
}

0 comments on commit 1176fd5

Please sign in to comment.