diff --git a/.readthedocs.yaml b/.readthedocs.yaml index b62c917..9ca1aa9 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -20,9 +20,9 @@ sphinx: configuration: docs/conf.py # Optionally build your docs in additional formats such as PDF and ePub -# formats: -# - pdf -# - epub +formats: + - pdf + - epub # Optional but recommended, declare the Python requirements required # to build your documentation diff --git a/MetaDataScraper.egg-info/PKG-INFO b/MetaDataScraper.egg-info/PKG-INFO index f378879..eaf1da3 100644 --- a/MetaDataScraper.egg-info/PKG-INFO +++ b/MetaDataScraper.egg-info/PKG-INFO @@ -1,9 +1,15 @@ Metadata-Version: 2.1 Name: MetaDataScraper -Version: 1.0.2 +Version: 1.0.3 Summary: A module designed to automate the extraction of follower counts and post details from a public Facebook page. Author-email: Ishan Surana +Maintainer-email: Ishan Surana Project-URL: Homepage, https://metadatascraper.readthedocs.io/en/latest/ +Project-URL: Documentation, https://metadatascraper.readthedocs.io/en/latest/ +Project-URL: Repository, https://github.com/ishan-surana/MetaDataScraper +Project-URL: Changelog, https://github.com/ishan-surana/MetaDataScraper/releases +Project-URL: Issues, https://github.com/ishan-surana/MetaDataScraper/issues +Keywords: facebook,scraper,meta,selenium,webdriver-manager,automation,web-scraping,web-crawling,web-automation,facebook-scraper,facebook-web-scraper,meta-scraper Classifier: Programming Language :: Python :: 3 Classifier: License :: OSI Approved :: Apache Software License Classifier: Operating System :: Microsoft :: Windows diff --git a/MetaDataScraper/FacebookScraper.py b/MetaDataScraper/FacebookScraper.py index 89ab1da..579ff20 100644 --- a/MetaDataScraper/FacebookScraper.py +++ b/MetaDataScraper/FacebookScraper.py @@ -58,7 +58,8 @@ class LoginlessScraper: ------- To scrape a Facebook page: - ```python + ```python + from MetaDataScraper import LoginlessScraper scraper = LoginlessScraper("page_id") data = scraper.scrape() @@ -190,7 +191,10 @@ def __extract_post_details(self): _c = 1 _error_count = 0 while True: - _xpath = self._xpath_first+str(c)+self._xpath_identifier_addum+self._xpath_last + if _c > 100: + print("Reached 100 posts. Exiting extraction...\n\n") + break + _xpath = self._xpath_first+str(_c)+self._xpath_identifier_addum+self._xpath_last if not self.driver.find_elements(By.XPATH, _xpath): _error_count += 1 if _error_count < 3: @@ -368,7 +372,8 @@ class LoggedInScraper: ------- To scrape a Facebook page: - ```python + ```python + from MetaDataScraper import LoggedInScraper scraper = LoggedInScraper("page_id", "email", "password") data = scraper.scrape() @@ -422,23 +427,22 @@ def __setup_driver(self): def __login(self): """Logs into Facebook using the provided credentials.""" - logged_in = False - while not logged_in: - if self.driver.find_elements(By.ID, 'not_me_link'): - self.driver.find_element(By.ID, 'not_me_link').click() - self.driver.get('https://www.facebook.com/login') - self.driver.find_element(By.NAME, 'email').clear() - self.driver.find_element(By.NAME, 'email').send_keys(self.email) - self.driver.find_element(By.NAME, 'pass').clear() - self.driver.find_element(By.NAME, 'pass').send_keys(self.password) - self.driver.find_element(By.ID, 'loginbutton').click() - # Wait until the login process is completed - WebDriverWait(self.driver, 10).until(EC.url_changes('https://www.facebook.com/login')) - if self.driver.current_url != 'https://www.facebook.com/?sk=welcome': - print("Invalid credentials. Please try again.", end='\r') - else: - print(" "*100, end='\r') - logged_in = True + self._logged_in = False + if self.driver.find_elements(By.ID, 'not_me_link'): + self.driver.find_element(By.ID, 'not_me_link').click() + self.driver.get('https://www.facebook.com/login') + self.driver.find_element(By.NAME, 'email').clear() + self.driver.find_element(By.NAME, 'email').send_keys(self.email) + self.driver.find_element(By.NAME, 'pass').clear() + self.driver.find_element(By.NAME, 'pass').send_keys(self.password) + self.driver.find_element(By.ID, 'loginbutton').click() + # Wait until the login process is completed + WebDriverWait(self.driver, 10).until(EC.url_changes('https://www.facebook.com/login')) + if self.driver.current_url != 'https://www.facebook.com/?sk=welcome': + raise Exception("Invalid credentials. Please try again.") + else: + print(" "*100, end='\r') + self._logged_in = True def __navigate_to_page(self): """Navigates to the specified Facebook page.""" @@ -522,7 +526,7 @@ def __extract_post_details(self): _c = 1 _error_count = 0 while True: - _xpath = self._xpath_first + str(c) + self._xpath_identifier_addum + self._xpath_last + _xpath = self._xpath_first + str(_c) + self._xpath_identifier_addum + self._xpath_last if not self.driver.find_elements(By.XPATH, _xpath): _error_count += 1 if _error_count < 3: @@ -587,6 +591,7 @@ def __extract_post_details(self): def scrape(self): """Initiates the scraping process and returns a dictionary with the scraped data.""" + self._logged_in = False self.__setup_driver() self.__login() self.__navigate_to_page() @@ -595,8 +600,8 @@ def scrape(self): self.__scroll_to_top() self.__get_xpath_constructor() self.__extract_post_details() - self.driver.quit() print("\033[A\033[A\033[A") # DevTools line deleter + self.driver.quit() return { 'followers': self.followers, 'post_texts': self.post_texts, diff --git a/dist/MetaDataScraper-1.0.2-py3-none-any.whl b/dist/MetaDataScraper-1.0.2-py3-none-any.whl deleted file mode 100644 index 89e0b70..0000000 Binary files a/dist/MetaDataScraper-1.0.2-py3-none-any.whl and /dev/null differ diff --git a/dist/MetaDataScraper-1.0.3-py3-none-any.whl b/dist/MetaDataScraper-1.0.3-py3-none-any.whl new file mode 100644 index 0000000..deebfa1 Binary files /dev/null and b/dist/MetaDataScraper-1.0.3-py3-none-any.whl differ diff --git a/dist/metadatascraper-1.0.2.tar.gz b/dist/metadatascraper-1.0.2.tar.gz deleted file mode 100644 index 79fed33..0000000 Binary files a/dist/metadatascraper-1.0.2.tar.gz and /dev/null differ diff --git a/dist/metadatascraper-1.0.3.tar.gz b/dist/metadatascraper-1.0.3.tar.gz new file mode 100644 index 0000000..a5c417d Binary files /dev/null and b/dist/metadatascraper-1.0.3.tar.gz differ diff --git a/docs/conf.py b/docs/conf.py index d54c4d2..4eab44c 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,14 +1,20 @@ # Configuration file for the Sphinx documentation builder. +# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html -# -- Project information +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information project = 'MetaDataScraper' +copyright = '2024, Ishan Surana' author = 'Ishan Surana' - +release = '1.0.3' repo_url = 'https://github.com/ishan-surana/MetaDataScraper/' -version = '1.0.0' +version = '1.0.3' -# -- General configuration +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration extensions = [ 'sphinx.ext.duration', @@ -17,7 +23,10 @@ 'sphinx.ext.autosummary', 'sphinx.ext.intersphinx', 'myst_parser', + 'sphinx_design', + 'sphinx_copybutton', ] +myst_enable_extensions = ["colon_fence"] source_suffix = { '.rst': 'restructuredtext', @@ -33,9 +42,15 @@ templates_path = ['_templates'] -# -- Options for HTML output +exclude_patterns = [] + + + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output -html_theme = 'sphinx_rtd_theme' +html_theme = 'pydata_sphinx_theme' +html_static_path = ['_static'] # -- Options for EPUB output -epub_show_urls = 'footnote' +epub_show_urls = 'footnote' \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst index bba9f2a..0b20a91 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,3 +1,9 @@ +:html_theme.sidebar_secondary.remove: +.. MetaDataScraper documentation master file, created by + sphinx-quickstart on Sun Aug 4 20:19:27 2024. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + Welcome to the MetaDataScraper documentation! =================================== @@ -13,5 +19,14 @@ Contents -------- .. toctree:: + :maxdepth: 2 README + +.. seealso:: + + Source Repository + `GitHub `_ + + Sponsorship + `Sponsorship `_ \ No newline at end of file diff --git a/docs/requirements.txt b/docs/requirements.txt index b590d50..69cd1d3 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,3 +1,5 @@ sphinx==7.1.2 -sphinx-rtd-theme==1.3.0rc1 -myst_parser \ No newline at end of file +myst_parser +sphinx-design +pydata-sphinx-theme +sphinx-copybutton \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index ba7624a..ee5166d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "MetaDataScraper" -version = "1.0.2" +version = "1.0.3" authors = [ { name="Ishan Surana", email="ishansurana1234@gmail.com" }, ]