-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathFacebook-Keywords-Scraper.py
150 lines (113 loc) · 7.06 KB
/
Facebook-Keywords-Scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
# Web driver configuration.
# Point `executable_path` at your local chromedriver binary.
service = Service(executable_path = r"C:\Users\USER\Documents\ChromeDriver\chromedriver.exe")

# `option` is what facebook() passes to webdriver.Chrome. The "debuggerAddress"
# setting makes the driver attach to a Chrome instance listening for remote
# debugging on localhost:9222 (presumably one where you are already logged in
# to Facebook) instead of starting a fresh, logged-out browser.
option = Options()
option.add_experimental_option("debuggerAddress", "localhost:9222")

# Headless configuration, kept available under its original name. facebook()
# does not use it.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--headless")

# NOTE: no browser is started at import time. A module-level headless driver
# would never be used (facebook() creates its own local sessions) and would
# never be quit, leaving an orphaned Chrome process behind on every run.
from bs4 import BeautifulSoup
import time
from collections import OrderedDict
import pandas
# Lift pandas' display limits so printed DataFrames show every row and the
# full content of each cell (long post URLs are not truncated).
for _displayOption in ('display.max_colwidth', 'display.max_rows'):
    pandas.set_option(_displayOption, None)
# URL-encode the user's search query so it can be embedded in the search URL
import urllib.parse
def encodeURL(query):
    """Return *query* percent-encoded for use inside a URL.

    Spaces and reserved characters such as ``&`` and ``=`` are escaped;
    ``/`` is left untouched (the default behaviour of ``urllib.parse.quote``).
    """
    encodedQuery = urllib.parse.quote(query, safe='/')
    return encodedQuery
# Class list Facebook puts on the timestamp anchor of each post in the search
# results. The anchor's href only becomes the real post URL after a hover.
_POST_LINK_CLASSES = 'x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz x1heor9g xt0b8zv xo1l8bm'
# The same classes written as a CSS selector (".a.b.c") for Selenium.
_POST_LINK_SELECTOR = '.' + '.'.join(_POST_LINK_CLASSES.split())
# Class list of the element holding the author's name on a post page.
_AUTHOR_NAME_CLASSES = 'x1i10hfl xjbqb8w x6umtig x1b1mbwd xaqea5y xav7gou x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz xt0b8zv xzsf02u x1s688f'


def _hoverOver(driver, elements):
    """Move the mouse over each element in turn (best effort)."""
    actions = ActionChains(driver)
    for element in elements:
        try:
            actions.move_to_element(element).perform()
        except Exception:
            # Deliberately ignored: an element may have gone stale or be
            # off-screen; skipping it only means that one link stays hidden.
            continue


def _selectPostLinks(anchors, num):
    """Return up to *num* distinct hrefs containing 'facebook', in page order."""
    links, seen = [], set()
    for anchor in anchors:
        if len(links) >= num:
            break
        href = anchor.get('href')
        # Anchors without an href, non-Facebook links and repeats are skipped.
        if not href or 'facebook' not in href or href in seen:
            continue
        seen.add(href)
        links.append(href)
    return links


def _collectPostLinks(driver, url, num):
    """Load the search page, scroll through it and return up to *num* post links."""
    driver.get(url)
    # Wait for the website to load initially
    time.sleep(3)
    # Unique anchors in first-seen order. Scrolling makes the same posts show
    # up in several snapshots, so duplicates are dropped as they arrive.
    anchors = OrderedDict()
    # One pass per requested post at most; each pass scrolls further down so
    # that Facebook loads the next batch of posts.
    for _ in range(num):
        try:
            # Posts load dynamically, so give the next batch time to appear.
            time.sleep(1)
            timestamps = driver.find_elements(By.CSS_SELECTOR, _POST_LINK_SELECTOR)
            # The URL of a post is hidden until its timestamp is hovered.
            _hoverOver(driver, timestamps)
        except Exception as e:
            print(f"An error occurred while performing hover action: {e}")
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        for anchor in soup.find_all('a', class_=_POST_LINK_CLASSES):
            anchors.setdefault(anchor, None)
        # Collect twice as many anchors as requested, as a buffer for links
        # that later turn out to be unusable.
        if len(anchors) >= num * 2:
            break
        # Scroll down only (x offset 0); the distance grows with the current
        # position so later passes jump further.
        current = driver.execute_script("return window.pageYOffset;")
        driver.execute_script(f"window.scrollBy(0, {current}+2000);")
    return _selectPostLinks(anchors, num)


def _extractAuthorName(soup):
    """Return the author name found in a parsed post page, or '' if none."""
    # The name is normally a link; it may be a <span> or <div> instead when
    # it is not clickable (e.g. the account was deleted or removed).
    for tagName in ('a', 'span', 'div'):
        node = soup.find(tagName, class_=_AUTHOR_NAME_CLASSES)
        if node is not None:
            return node.text.strip()
    return ''


def _collectAuthors(driver, links):
    """Visit each link and return parallel lists (author names, final URLs)."""
    nameList, urlList = [], []
    for link in links:
        try:
            driver.get(link)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            name = _extractAuthorName(soup)
            finalUrl = driver.current_url
        except Exception as e:
            print(f"An error occurred while processing link: {e}")
            continue
        # Append both values together so the two lists always stay the same
        # length (the DataFrame constructor requires that).
        nameList.append(name)
        urlList.append(finalUrl)
    return nameList, urlList


def facebook(query, num):
    """Search Facebook posts for *query* and save authors and URLs to a CSV.

    Opens the Facebook post search for the URL-encoded query, scrolls the
    result page to collect up to *num* post links, then visits every link to
    read the author's name. The result is written to
    'Facebook-Unprocessed.csv' with the columns 'User Name' and 'Post URL'.

    Parameters:
        query: text to search for.
        num: maximum number of posts to collect.

    Returns None. Errors raised while scraping are printed, not propagated;
    only a failure to start the first browser session reaches the caller.
    """
    # Preparing the URL for it and encoding the user query to match the URL
    url = 'https://www.facebook.com/search/posts/?q=' + encodeURL(query) + '&filters=eyJyZWNlbnRfcG9zdHM6MCI6IntcIm5hbWVcIjpcInJlY2VudF9wb3N0c1wiLFwiYXJnc1wiOlwiXCJ9In0%3D'
    # Getting the web driver with custom options
    driver = webdriver.Chrome(service=service, options=option)
    try:
        # Phase 1: gather the post links from the search results.
        try:
            links = _collectPostLinks(driver, url, num)
        finally:
            # Always release the session, even when scraping failed.
            driver.quit()
        # Phase 2: extract data from each Facebook page retrieved, using a
        # fresh session.
        driver = webdriver.Chrome(service=service, options=option)
        try:
            nameList, urlList = _collectAuthors(driver, links)
        finally:
            driver.quit()
        # Create a DataFrame from the lists and save it in a CSV format
        output = pandas.DataFrame({'User Name': nameList, 'Post URL': urlList})
        output.to_csv('Facebook-Unprocessed.csv', index=False)
    except Exception as e:
        print(f"An error occurred while running the script: {e}")
# Search settings: fill these in before running the script.
keyword = ''     # text to search Facebook posts for
maxSearch = 0    # maximum number of posts to collect

# Run the scraper only when this file is executed directly, so that importing
# the module does not open a browser and start scraping.
if __name__ == "__main__":
    facebook(keyword, maxSearch)