diff --git a/.gitignore b/.gitignore
index ddd3a616c1..b2de0b22ba 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,7 +25,6 @@ ftest/*.csv
 *.sqlar
 *-wal
 *-shm
-*.csv
 
 /crawl
 /downloaded
diff --git a/minet/reddit/scraper.py b/minet/reddit/scraper.py
index 542f93b28a..f49daa8c9f 100644
--- a/minet/reddit/scraper.py
+++ b/minet/reddit/scraper.py
@@ -296,6 +296,7 @@ def get_comments(self, url: str, all):
                     yield data
 
     def get_general_post(self, url: str, type: str, add_text: bool, limit: int):
+        fn = data_posts if type == "subreddit" else data_user_posts
         n_crawled = 0
         old_url = get_old_url(get_url_from_subreddit(url))
         while old_url and (limit is None or n_crawled < limit):
@@ -328,71 +329,37 @@ def get_general_post(self, url: str, type: str, add_text: bool, limit: int):
                         post_url, self.pool_manager
                     )
                     if text_error:
-                        if type == "subreddit":
-                            yield data_posts(
-                                post,
-                                title,
-                                post_url,
-                                "",
-                                upvote,
-                                n_comments_scraped,
-                                n_comments,
-                                published_date,
-                                edited_date,
-                                link,
-                                text_error,
-                            )
-                        else:
-                            yield data_user_posts(
-                                post,
-                                title,
-                                post_url,
-                                "",
-                                upvote,
-                                n_comments_scraped,
-                                n_comments,
-                                published_date,
-                                edited_date,
-                                link,
-                                text_error,
-                            )
-                    try_content = text_soup.select_one("div#siteTable div.usertext")
-                    if try_content:
-                        content = try_content.get_text()
-                    else:
-                        content = ""
-                else:
-                    content = ""
-                if type == "subreddit":
-                    post = data_posts(
-                        post,
-                        title,
-                        post_url,
-                        content,
-                        upvote,
-                        n_comments_scraped,
-                        n_comments,
-                        published_date,
-                        edited_date,
-                        link,
-                        error,
+                        yield fn(
+                            post,
+                            title,
+                            post_url,
+                            None,
+                            upvote,
+                            n_comments_scraped,
+                            n_comments,
+                            published_date,
+                            edited_date,
+                            link,
+                            text_error,
+                        )
+                    content = text_soup.scrape_one(
+                        "div#siteTable div.usertext-body"
                     )
                 else:
-                    post = data_user_posts(
-                        post,
-                        title,
-                        post_url,
-                        content,
-                        upvote,
-                        n_comments_scraped,
-                        n_comments,
-                        published_date,
-                        edited_date,
-                        link,
-                        error,
-                    )
-
-                yield post
+                    content = ""
+                yield fn(
+                    post,
+                    title,
+                    post_url,
+                    content,
+                    upvote,
+                    n_comments_scraped,
+                    n_comments,
+                    published_date,
+                    edited_date,
+                    link,
+                    error,
+                )
                 n_crawled += 1
             old_url = soup.scrape_one("span.next-button a", "href")
 