Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
rohit7044 authored May 24, 2021
1 parent 8dbfac0 commit 4b29a18
Show file tree
Hide file tree
Showing 2 changed files with 298 additions and 0 deletions.
117 changes: 117 additions & 0 deletions src/audible_data_handling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@

import csv
import os
import re
# Fixed per-book columns; they form the first part of the CSV header row.
book_data_columns = [
    "Book Title",
    "Book Subtitle",
    "Book Author",
    "Book Narrator",
    "Audio Runtime",
    "Audiobook_Type",
    "Categories",
    "Rating",
    "Price",
]

# CSV header row. Starts empty and is filled in by review_column_creator().
final_column_data = []

# Most recent data row (book fields followed by reviews); rebound by write_to_csv().
final_data_list = []

# Module-level placeholders. write_to_csv() and review_column_creator() use
# same-named parameters/locals, so these objects are not modified by the
# functions visible in this file.
book_data_list = []
final_reviews_list = []
book_review_columns = []
total_review_count = ""

def data_preprocessing(length, price):
    """Strip the descriptive labels from a scraped runtime and price.

    Args:
        length: Runtime text; every ``"Length: "`` occurrence is removed.
        price: Price text; may carry one of the labels ``"Pre-order for "``,
            ``"Regular price: "`` or ``"Buy for "``.

    Returns:
        A ``(runtime, price)`` tuple with the labels removed. A price that
        carries none of the known labels is returned unchanged.
    """
    n_length = length.replace("Length: ", "")

    # (marker, label) pairs, checked in order. Each label is stripped from the
    # ORIGINAL price text, so when several markers are present the last
    # matching pair decides the result.
    price_labels = (
        ("Pre", "Pre-order for "),
        ("Reg", "Regular price: "),
        ("Buy", "Buy for "),
    )

    # Start from the raw text: previously a price without any known marker
    # left n_price unbound and the return statement raised UnboundLocalError.
    n_price = price
    for marker, label in price_labels:
        if marker in price:
            n_price = price.replace(label, "")

    return n_length, n_price
def write_to_csv(book_data_list, final_reviews_list):
    """Append one book's row to ``output_encoded.csv``.

    The row is ``book_data_list`` followed by ``final_reviews_list``; it is
    also stored in the module-level ``final_data_list``. When the file is
    missing or empty, the header row held in the module-level
    ``final_column_data`` is written first.

    Args:
        book_data_list: List of the book's field values.
        final_reviews_list: List of review texts appended after the book fields.

    Errors raised while opening or writing the file are printed, not raised.
    """
    global final_data_list
    final_data_list = book_data_list + final_reviews_list
    csv_file_path = "output_encoded.csv"

    # NOTE: a plain-text export (output.txt) was prototyped here and left
    # disabled; CSV is the only output this function produces.
    try:
        # A missing or zero-byte file still needs its header row.
        # (Compared with ==, not "is": identity of ints is an implementation detail.)
        needs_header = (
            not os.path.exists(csv_file_path)
            or os.path.getsize(csv_file_path) == 0
        )
        # Append mode creates the file when absent, so one open() call covers
        # the new-file, empty-file and existing-file cases. The context
        # manager closes the handle even if a write fails.
        with open(csv_file_path, "a", encoding="utf-8", newline="") as csv_file:
            writer = csv.writer(csv_file, delimiter=",")
            if needs_header:
                writer.writerow(final_column_data)
            writer.writerow(final_data_list)
    except Exception as e:
        # Best effort, as before: report the problem and let the crawl go on.
        print(e)
def review_column_creator(review_list):
    """Build the CSV header once: the book columns plus numbered review columns.

    The result is stored in the module-level ``final_column_data``. If that
    list is already populated the call does nothing, so the header is fixed by
    the first call.

    Args:
        review_list: Number of review columns to create (an integer count,
            despite the parameter name).
    """
    global final_column_data
    if final_column_data:
        return

    # Every column is named "Review <n>" for n = 1..count. The old loop named
    # the last one "Review<n>" (no space) and produced no column at all for a
    # count of 1.
    review_columns = ["Review " + str(number) for number in range(1, review_list + 1)]
    final_column_data = book_data_columns + review_columns

181 changes: 181 additions & 0 deletions src/audible_webpage_automation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@

import time
import re
import audible_data_handling as adh
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
# from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options

# Module-level placeholders for the scraped fields. The scraping functions in
# this file assign same-named locals (no ``global`` statement), so these
# defaults are not modified by them.
book_title = book_author = book_narrator = ""
audio_runtime = audiobook_type = categories = ""
rating = book_price = reviews = ""
final_reviews_list = []
book_data_list = []

# Set by audible_homepage_open(); reviews_crawler() uses it for the number of
# "show more" clicks and, multiplied by 10, for the number of review columns.
showmore_open_times = 0

# Scratch XPath for a single review paragraph:
# //div[contains(@class,'Reviews')]/div[1]/div[2]/p[1]
def audible_homepage_open(audible_homepage_link, audible_link, open_times,
                          driver_path="C:/chromedriver/chromedriver.exe",
                          extension_path="C:/chropath/extension_6_1_11_0.crx"):
    """Start Chrome, open the two given URLs in turn, and crawl the listing.

    Args:
        audible_homepage_link: First URL to load.
        audible_link: Second URL to load; click_element() then runs on it.
        open_times: Stored in the module-level ``showmore_open_times``, which
            reviews_crawler() reads.
        driver_path: Path to the chromedriver executable. Defaults to the
            location that was previously hard-coded.
        extension_path: Path to a ``.crx`` extension to load, or a falsy value
            to start Chrome without one. Defaults to the previously
            hard-coded extension.

    Returns:
        Whatever click_element() returns.
    """
    global showmore_open_times
    showmore_open_times = open_times

    chrome_options = Options()
    if extension_path:
        chrome_options.add_extension(extension_path)

    # "options=" replaces the deprecated "chrome_options=" keyword.
    driver = webdriver.Chrome(executable_path=driver_path, options=chrome_options)
    try:
        driver.get(audible_homepage_link)
        driver.get(audible_link)
        return click_element(driver)
    finally:
        # Always shut the browser and chromedriver down, even when the crawl
        # fails part-way; previously they were left running.
        driver.quit()

def click_element(driver):
    """Process every product on the current listing page, then each next page.

    For each product the link is opened and scraped via tab_switch(). When the
    "next" button cannot be found or clicked, "End of list" is printed and the
    function returns.

    Args:
        driver: Selenium WebDriver positioned on a product listing page.
    """
    product_xpath = "//div[contains(@data-widget,'productList')]/li"

    # Pages are walked in a loop rather than by recursion, so the call stack
    # stays flat however many pages there are.
    while True:
        product_count = len(driver.find_elements_by_xpath(product_xpath))

        # XPath positions are 1-based, so the last product sits at index
        # product_count; range(1, product_count) used to skip it.
        for item in range(1, product_count + 1):
            book_link = driver.find_element_by_xpath(
                product_xpath + "[" + str(item) + "]//li[1]//a"
            )
            tab_switch(driver, book_link, item)
            print("end of " + book_link.text)

        # Only the button lookup and click are guarded. Errors raised while
        # scraping a page are no longer swallowed and reported as
        # "End of list".
        try:
            next_button = driver.find_element_by_xpath("//span[contains(@class,'nextButton')]")
            next_button.click()
        except Exception:
            print("End of list")
            return

def tab_switch(driver, book_link, item):
    """Send Ctrl+Enter to *book_link*, focus the second window, and scrape it.

    Args:
        driver: Selenium WebDriver that owns the windows.
        book_link: Link element that receives the key chord.
        item: Index of the product on the listing page; passed through to
            fetch_element_data().
    """
    key_chord = Keys.CONTROL + Keys.ENTER
    book_link.send_keys(key_chord)

    # The code assumes the newly opened tab is the second window handle.
    second_handle = driver.window_handles[1]
    driver.switch_to.window(second_handle)

    fetch_element_data(driver, item)

def tab_close(driver):
    """Close the currently focused window and switch to the first handle.

    Args:
        driver: Selenium WebDriver whose current window should be closed.
    """
    driver.close()
    # Use switch_to.window(), the same call tab_switch() makes, instead of the
    # deprecated driver.switch_to_window() alias.
    driver.switch_to.window(driver.window_handles[0])

def _xpath_literal(text):
    """Quote *text* as an XPath 1.0 string literal, whichever quotes it holds."""
    if '"' not in text:
        return '"' + text + '"'
    if "'" not in text:
        return "'" + text + "'"
    # XPath 1.0 has no escape character, so a value containing both quote
    # kinds is assembled with concat() around literal double quotes.
    pieces = ['"' + piece + '"' for piece in text.split('"')]
    return "concat(" + ", '\"', ".join(pieces) + ")"


def _scrape_detail_page(driver):
    """Read the book fields from the open product page; raise if a required one is missing."""
    info = "//div[contains(@class,'centerSlot')][2]/div//li"

    def required(xpath):
        return driver.find_element_by_xpath(xpath).text

    def optional(xpath):
        # Subtitle, categories and rating may be absent; fall back to "".
        try:
            return required(xpath)
        except Exception:
            return ""

    book_title = required(info + "[1]")
    book_subtitle = optional(info + "[2]")
    book_author = required(info + "[contains(@class,'authorLabel')]/a")
    book_narrator = required(info + "[contains(@class,'narratorLabel')]/a")
    audio_runtime = required(info + "[contains(@class,'runtimeLabel')]")
    audiobook_type = required(info + "[contains(@class,'format')]")
    categories = optional(info + "[contains(@class,'categoriesLabel')]/a")
    rating = optional(info + "[contains(@class,'ratingsLabel')]/span[2]")
    # The price is the text of the link whose title attribute contains the
    # book title. The title is quoted safely, so a title containing a double
    # quote no longer produces an invalid XPath.
    book_price = required("//a[contains(@title," + _xpath_literal(book_title) + ")]")

    p_audio_runtime, p_book_price = adh.data_preprocessing(audio_runtime, book_price)
    return [book_title, book_subtitle, book_author, book_narrator, p_audio_runtime,
            audiobook_type, categories, rating, p_book_price]


def fetch_element_data(driver, item):
    """Scrape the open product tab, close it, and write one CSV row.

    If the product page cannot be scraped, the book fields are taken from the
    listing page via data_not_found() instead. Reviews are collected from the
    product tab in both cases.

    Args:
        driver: Selenium WebDriver focused on the product tab.
        item: Index of the product on the listing page, used by the fallback.
    """
    # Only the field scraping is guarded. Previously the try block also
    # covered the review crawl, the CSV write and the tab close, so a late
    # failure re-ran the crawl and could write the same book twice.
    try:
        book_data_list = _scrape_detail_page(driver)
    except Exception:
        book_data_list = None

    final_reviews_list = reviews_crawler(driver)
    tab_close(driver)

    if book_data_list is None:
        # Runs after tab_close() because data_not_found() queries the
        # product-list markup of the listing page.
        book_data_list = data_not_found(driver, item)

    adh.write_to_csv(book_data_list, final_reviews_list)

def reviews_crawler(driver):
    """Collect the cleaned review texts shown on the open product page.

    Calls adh.review_column_creator() with ``showmore_open_times * 10`` so the
    CSV header exists, clicks "show more reviews" ``showmore_open_times - 1``
    times, then reads every review paragraph on the page.

    Args:
        driver: Selenium WebDriver focused on a product page.

    Returns:
        A list of review strings with characters outside the allowed set
        removed. If the page shows no reviews, a list of
        ``showmore_open_times * 10`` empty strings.
    """
    review_xpath = "//div[contains(@class,'ReviewsTabUS')]/div/div[2]/p[1]"
    disallowed_chars = re.compile(r'[^a-zA-Z0-9()\[\]\{\}.,!?\' */\"]')

    total_review_columns = showmore_open_times * 10
    adh.review_column_creator(total_review_columns)

    if not driver.find_elements_by_xpath(review_xpath):
        return [""] * total_review_columns

    try:
        for _ in range(1, showmore_open_times):
            time.sleep(3)
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            driver.find_element_by_xpath("//span[contains(@class,'showMoreReviews')]").click()
            time.sleep(5)
    except Exception:
        # The button is missing or not clickable: keep whatever reviews are
        # already loaded (the old except-branch did the same).
        pass

    review_elements = driver.find_elements_by_xpath(review_xpath)
    if not review_elements:
        return [""] * total_review_columns

    # Read the found elements directly instead of looking each one up again by
    # position. The old range(1, len) loop with a special case for the last
    # index returned nothing when exactly one review was present.
    return [disallowed_chars.sub("", element.text) for element in review_elements]

def data_not_found(driver, item):
    """Build a book row from the listing-page entry at position *item*.

    Audiobook type and categories are not read here and are left empty.
    Subtitle, narrator and rating fall back to "" when their element cannot
    be read; a missing title, author, runtime or price raises.

    Args:
        driver: Selenium WebDriver showing the product listing.
        item: 1-based position of the product's ``li`` in the product list.

    Returns:
        ``[title, subtitle, author, narrator, runtime, "", "", rating, price]``
        with runtime and price passed through adh.data_preprocessing().
    """
    row = "//div[contains(@data-widget,'productList')]/li[" + str(item) + "]"

    def required(xpath):
        return driver.find_element_by_xpath(xpath).text

    def optional(xpath):
        # "except Exception" instead of a bare except, so KeyboardInterrupt
        # and SystemExit are no longer swallowed.
        try:
            return required(xpath)
        except Exception:
            return ""

    book_title = required(row + "//h3[contains(@class,'heading')]")
    book_subtitle = optional(row + "//li[contains(@class,'subtitle')]")
    book_author = required(row + "//li[contains(@class,'authorLabel')]//a")
    book_narrator = optional(row + "//li[contains(@class,'narratorLabel')]//a")
    audio_runtime = required(row + "//li[contains(@class,'runtimeLabel')]")
    audiobook_type = ""
    categories = ""
    rating = optional(row + "//li[contains(@class,'ratingsLabel')]/span[1]")
    book_price = required(row + "//div[contains(@class,'BuyBox')]/p[1]")

    p_audio_runtime, p_book_price = adh.data_preprocessing(audio_runtime, book_price)
    return [book_title, book_subtitle, book_author, book_narrator, p_audio_runtime,
            audiobook_type, categories, rating, p_book_price]



0 comments on commit 4b29a18

Please sign in to comment.