-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
2 changed files
with
298 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
|
||
import csv | ||
import os | ||
import re | ||
# Most recent CSV row; write_to_csv() rebinds it to book fields + reviews.
final_data_list = []
book_data_list = []
final_reviews_list = []
# Fixed leading CSV columns describing one book. review_column_creator()
# appends the per-review columns after these to form the full header.
book_data_columns = [
    "Book Title",
    "Book Subtitle",
    "Book Author",
    "Book Narrator",
    "Audio Runtime",
    "Audiobook_Type",
    "Categories",
    "Rating",
    "Price",
]
book_review_columns = []
total_review_count = ""
# Full CSV header row; stays empty until review_column_creator() fills it.
final_column_data = []
|
||
def data_preprocessing(length, price):
    """Strip the descriptive labels from a runtime string and a price string.

    Args:
        length: Runtime text; every ``"Length: "`` occurrence is removed.
        price: Price text. If it contains "Pre", "Reg" or "Buy", the matching
            label ("Pre-order for ", "Regular price: ", "Buy for ") is
            removed. When several markers are present the last one checked
            wins, and only its label is removed.

    Returns:
        A ``(n_length, n_price)`` tuple of the cleaned strings.
    """
    n_length = length.replace("Length: ", "")

    # Fall back to the raw price so an unrecognised label (e.g. "Free")
    # is passed through instead of leaving n_price unbound.
    n_price = price
    labels = (
        ("Pre", "Pre-order for "),
        ("Reg", "Regular price: "),
        ("Buy", "Buy for "),
    )
    for marker, label in labels:
        if marker in price:
            n_price = price.replace(label, "")
    return n_length, n_price
def write_to_csv(book_data_list, final_reviews_list):
    """Append one book row to ``output_encoded.csv``.

    The row is ``book_data_list + final_reviews_list`` and is also stored in
    the module-level ``final_data_list``. If the file does not exist or is
    empty, it is (re)created and the header row ``final_column_data`` is
    written first; otherwise the row is appended.

    Args:
        book_data_list: List of book field values.
        final_reviews_list: List of review texts for the same book.

    Returns:
        None. Failures while writing are printed, not raised, so one bad
        row does not abort the caller.
    """
    global final_data_list
    final_data_list = book_data_list + final_reviews_list
    csv_file_path = "output_encoded.csv"

    try:
        # Compare the size by value; `is 0` tests object identity.
        needs_header = (
            not os.path.exists(csv_file_path)
            or os.path.getsize(csv_file_path) == 0
        )
        mode = "w" if needs_header else "a"
        # The context manager closes the handle even if a write fails.
        with open(csv_file_path, mode, encoding="utf-8", newline="") as csv_file:
            writer = csv.writer(csv_file, delimiter=",")
            if needs_header:
                writer.writerow(final_column_data)
            writer.writerow(final_data_list)
    except Exception as e:
        # Deliberate best-effort: report and carry on.
        print(e)
    return
def review_column_creator(review_list):
    """Build the CSV header once, if it has not been built yet.

    When the module-level ``final_column_data`` is empty it is set to
    ``book_data_columns`` followed by "Review 1" .. "Review N". If it is
    already populated, nothing changes.

    Args:
        review_list: Despite the name, the integer number N of review
            columns to create.

    Returns:
        None.
    """
    global final_column_data
    if final_column_data:
        return
    # Inclusive upper bound: a count of 1 must still yield "Review 1", and
    # every column uses the same "Review <n>" spelling.
    review_columns = ["Review " + str(n) for n in range(1, review_list + 1)]
    final_column_data = book_data_columns + review_columns
    return
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,181 @@ | ||
|
||
import time | ||
import re | ||
import audible_data_handling as adh | ||
from selenium import webdriver | ||
from selenium.webdriver.common.keys import Keys | ||
# from selenium.webdriver.support import expected_conditions as EC | ||
from selenium.webdriver.chrome.options import Options | ||
|
||
# Module-level placeholders for the scraped fields. The crawler functions
# below assign same-named local variables and do not declare these global.
book_title = ""
book_author = ""
book_narrator = ""
audio_runtime = ""
audiobook_type = ""
categories = ""
rating = ""
book_price = ""
reviews = ""
final_reviews_list = []
book_data_list = []
# Set by audible_homepage_open(); reviews_crawler() reads it to size the
# review columns (x10) and to bound its "show more" click loop.
showmore_open_times = 0
# //div[contains(@class,'Reviews')]/div[1]/div[2]/p[1]
def audible_homepage_open(audible_homepage_link, audible_link, open_times,
                          extension_path='C:/chropath/extension_6_1_11_0.crx',
                          driver_path="C:/chromedriver/chromedriver.exe"):
    """Start Chrome, open the listing page and crawl it.

    Args:
        audible_homepage_link: URL loaded first.
        audible_link: URL of the product listing loaded next and crawled.
        open_times: Stored in the module-level ``showmore_open_times``.
        extension_path: Chrome extension (.crx) to load; defaults to the
            path that was previously hard-coded.
        driver_path: chromedriver executable; defaults to the path that was
            previously hard-coded.

    Returns:
        Whatever ``click_element`` returns.
    """
    global showmore_open_times
    showmore_open_times = open_times

    chrome_options = Options()
    chrome_options.add_extension(extension_path)
    driver = webdriver.Chrome(executable_path=driver_path, chrome_options=chrome_options)
    try:
        driver.get(audible_homepage_link)
        driver.get(audible_link)
        return click_element(driver)
    finally:
        # The driver is not handed back to the caller, so release the
        # browser and chromedriver process here, even when crawling fails.
        driver.quit()
|
||
def click_element(driver):
    """Process every product on the current listing page, then the next page.

    Each product link is handed to ``tab_switch``. Afterwards the "next"
    button is clicked and this function recurses for the following page.
    When the button cannot be found or clicked, "End of list" is printed
    and the crawl stops.

    Args:
        driver: Selenium WebDriver positioned on a product listing page.

    Returns:
        None.
    """
    product_list = driver.find_elements_by_xpath("//div[contains(@data-widget,'productList')]/li")
    length_of_product_list = len(product_list)
    # XPath positions are 1-based, so the upper bound must be inclusive or
    # the last product on the page is never visited.
    for item in range(1, length_of_product_list + 1):
        book_link = driver.find_element_by_xpath(
            "//div[contains(@data-widget,'productList')]/li[" + str(item) + "]//li[1]//a"
        )
        tab_switch(driver, book_link, item)
        print("end of " + book_link.text)

    try:
        next_button = driver.find_element_by_xpath("//span[contains(@class,'nextButton')]")
        next_button.click()
        # NOTE: the recursive call is inside the try, as in the original, so
        # a failure on a later page also ends the crawl with "End of list".
        click_element(driver)
    except Exception:
        print("End of list")
|
||
def tab_switch(driver, book_link, item):
    """Open ``book_link`` via Ctrl+Enter and scrape the newly opened window.

    Args:
        driver: Selenium WebDriver on the listing page.
        book_link: Link element of the product to open.
        item: 1-based position of the product in the listing; passed on to
            ``fetch_element_data``.
    """
    # Ctrl+Enter is expected to open the link in a second tab/window.
    book_link.send_keys(Keys.CONTROL + Keys.ENTER)
    driver.switch_to.window(driver.window_handles[1])
    fetch_element_data(driver, item)
|
||
def tab_close(driver):
    """Close the current window and return focus to the first window.

    Args:
        driver: Selenium WebDriver whose current window should be closed.
    """
    driver.close()
    # Use the same `switch_to.window` API as tab_switch(); the
    # `switch_to_window` alias is deprecated.
    driver.switch_to.window(driver.window_handles[0])
|
||
def _xpath_literal(text):
    """Return ``text`` quoted as an XPath 1.0 string literal."""
    if '"' not in text:
        return '"' + text + '"'
    if "'" not in text:
        return "'" + text + "'"
    # XPath 1.0 has no escape syntax, so join the pieces around each double
    # quote with concat().
    pieces = ['"' + piece + '"' for piece in text.split('"')]
    return "concat(" + ", '\"', ".join(pieces) + ")"


def fetch_element_data(driver, item):
    """Scrape one book page, write its CSV row and close the tab.

    Reads title, subtitle, author, narrator, runtime, format, category,
    rating and price from the current window, collects reviews with
    ``reviews_crawler`` and writes the row via ``adh.write_to_csv``.
    Subtitle, category and rating fall back to "" when missing.

    If any other step fails, reviews are still collected from the current
    window, the tab is closed, and the book fields are taken from the
    listing page with ``data_not_found`` instead.

    Args:
        driver: Selenium WebDriver focused on the book's window.
        item: 1-based position of the book in the listing (fallback only).
    """
    detail = "//div[contains(@class,'centerSlot')][2]/div//li"
    try:
        book_title = driver.find_element_by_xpath(detail + "[1]").text
        try:
            book_subtitle = driver.find_element_by_xpath(detail + "[2]").text
        except Exception:
            book_subtitle = ""
        book_author = driver.find_element_by_xpath(detail + "[contains(@class,'authorLabel')]/a").text
        book_narrator = driver.find_element_by_xpath(detail + "[contains(@class,'narratorLabel')]/a").text
        audio_runtime = driver.find_element_by_xpath(detail + "[contains(@class,'runtimeLabel')]").text
        audiobook_type = driver.find_element_by_xpath(detail + "[contains(@class,'format')]").text
        try:
            categories = driver.find_element_by_xpath(detail + "[contains(@class,'categoriesLabel')]/a").text
        except Exception:
            categories = ""
        try:
            rating = driver.find_element_by_xpath(detail + "[contains(@class,'ratingsLabel')]/span[2]").text
        except Exception:
            rating = ""
        # Quote the title as a proper XPath literal so a title containing a
        # double quote does not produce an invalid expression.
        book_price = driver.find_element_by_xpath(
            "//a[contains(@title," + _xpath_literal(book_title) + ")]"
        ).text
        p_audio_runtime, p_book_price = adh.data_preprocessing(audio_runtime, book_price)
        final_reviews_list = reviews_crawler(driver)
        book_data_list = [
            book_title, book_subtitle, book_author, book_narrator,
            p_audio_runtime, audiobook_type, categories, rating, p_book_price,
        ]
        adh.write_to_csv(book_data_list, final_reviews_list)
        tab_close(driver)
    except Exception:
        # Reviews must be read before the tab is closed; the book fields
        # then come from the listing page the driver returns to.
        final_reviews_list = reviews_crawler(driver)
        tab_close(driver)
        book_data_list = data_not_found(driver, item)
        adh.write_to_csv(book_data_list, final_reviews_list)
|
||
_REVIEW_TEXT_XPATH = "//div[contains(@class,'ReviewsTabUS')]/div/div[2]/p[1]"
# Any character outside this whitelist is stripped from review text.
_REVIEW_DISALLOWED = re.compile(r'[^a-zA-Z0-9()\[\]\{\}.,!?\' */\"]')


def reviews_crawler(driver):
    """Collect the cleaned review texts shown on the current book page.

    Ensures the CSV header exists (``showmore_open_times * 10`` review
    columns). If the page shows any reviews, it clicks the "show more
    reviews" control up to ``showmore_open_times - 1`` times, stopping
    early when the control cannot be found or clicked. It then reads every
    displayed review.

    Args:
        driver: Selenium WebDriver focused on a book page.

    Returns:
        List of review strings, padded with "" up to the number of review
        columns so rows line up with the header. Extra reviews beyond that
        number are kept, not truncated.
    """
    total_review_columns = showmore_open_times * 10
    adh.review_column_creator(total_review_columns)

    local_reviews_list = []
    if driver.find_elements_by_xpath(_REVIEW_TEXT_XPATH):
        for _ in range(1, showmore_open_times):
            time.sleep(3)
            try:
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                more_reviews = driver.find_element_by_xpath("//span[contains(@class,'showMoreReviews')]")
                more_reviews.click()
            except Exception:
                # No further "show more" control: keep what is displayed.
                break
            time.sleep(5)

        # Read the matched elements directly instead of re-querying each by
        # position; this also keeps a page that has exactly one review.
        review_elements = driver.find_elements_by_xpath(_REVIEW_TEXT_XPATH)
        local_reviews_list = [
            _REVIEW_DISALLOWED.sub("", element.text) for element in review_elements
        ]

    missing = total_review_columns - len(local_reviews_list)
    local_reviews_list.extend([""] * missing)
    return local_reviews_list
|
||
def data_not_found(driver, item):
    """Read a book's fields from its entry on the listing page.

    Used as the fallback when the book's own page could not be scraped.
    Subtitle, narrator and rating fall back to "" when missing; format and
    category are always "" because this function does not look them up.

    Args:
        driver: Selenium WebDriver focused on the product listing page.
        item: 1-based position of the book in the listing.

    Returns:
        List ``[title, subtitle, author, narrator, runtime, type,
        categories, rating, price]`` with runtime and price cleaned by
        ``adh.data_preprocessing``.
    """
    entry = "//div[contains(@data-widget,'productList')]/li[" + str(item) + "]"

    book_title = driver.find_element_by_xpath(entry + "//h3[contains(@class,'heading')]").text
    try:
        book_subtitle = driver.find_element_by_xpath(entry + "//li[contains(@class,'subtitle')]").text
    except Exception:
        book_subtitle = ""
    book_author = driver.find_element_by_xpath(entry + "//li[contains(@class,'authorLabel')]//a").text
    try:
        book_narrator = driver.find_element_by_xpath(entry + "//li[contains(@class,'narratorLabel')]//a").text
    except Exception:
        book_narrator = ""
    audio_runtime = driver.find_element_by_xpath(entry + "//li[contains(@class,'runtimeLabel')]").text
    audiobook_type = ""
    categories = ""
    try:
        rating = driver.find_element_by_xpath(entry + "//li[contains(@class,'ratingsLabel')]/span[1]").text
    except Exception:
        rating = ""
    book_price = driver.find_element_by_xpath(entry + "//div[contains(@class,'BuyBox')]/p[1]").text

    p_audio_runtime, p_book_price = adh.data_preprocessing(audio_runtime, book_price)
    return [
        book_title, book_subtitle, book_author, book_narrator,
        p_audio_runtime, audiobook_type, categories, rating, p_book_price,
    ]
|
||
|
||
|
if you want it in text