From a9ecc6b1a3b3abd67dad79bc99a1c6b29ae8ab0b Mon Sep 17 00:00:00 2001 From: Ronny Soutart Date: Sun, 4 Jul 2021 20:23:05 +1100 Subject: [PATCH] feat(userDetails): Add user details crawler #3 --- .../sdk/imgflip/sdk/ImgFlipURLHelper.java | 1 + .../imgflip/sdk/imgflip/sdk/UserCrawler.java | 227 ++++++++++++++++++ .../sdk/imgflip/sdk/domain/Stream.java | 32 +++ .../sdk/imgflip/sdk/domain/Submission.java | 32 +++ .../sdk/imgflip/sdk/domain/Template.java | 54 +++++ .../imgflip/sdk/imgflip/sdk/domain/User.java | 159 ++++++++++++ 6 files changed, 505 insertions(+) create mode 100644 src/main/java/com/github/adriens/imgflip/sdk/imgflip/sdk/UserCrawler.java create mode 100644 src/main/java/com/github/adriens/imgflip/sdk/imgflip/sdk/domain/Stream.java create mode 100644 src/main/java/com/github/adriens/imgflip/sdk/imgflip/sdk/domain/Submission.java create mode 100644 src/main/java/com/github/adriens/imgflip/sdk/imgflip/sdk/domain/Template.java create mode 100644 src/main/java/com/github/adriens/imgflip/sdk/imgflip/sdk/domain/User.java diff --git a/src/main/java/com/github/adriens/imgflip/sdk/imgflip/sdk/ImgFlipURLHelper.java b/src/main/java/com/github/adriens/imgflip/sdk/imgflip/sdk/ImgFlipURLHelper.java index 227e14b..d463ba8 100644 --- a/src/main/java/com/github/adriens/imgflip/sdk/imgflip/sdk/ImgFlipURLHelper.java +++ b/src/main/java/com/github/adriens/imgflip/sdk/imgflip/sdk/ImgFlipURLHelper.java @@ -20,6 +20,7 @@ public class ImgFlipURLHelper { public static final String IMGFLIP_ROOT_URL = "https://imgflip.com"; public static final String POPULAR_STREAM_URL = "/streams"; public static final String TOP_USERS_URL = "/topusers"; + public static final String USER_URL = "/user"; public static String getPagePath(String inStream, int page) { String out = IMGFLIP_ROOT_URL; diff --git a/src/main/java/com/github/adriens/imgflip/sdk/imgflip/sdk/UserCrawler.java b/src/main/java/com/github/adriens/imgflip/sdk/imgflip/sdk/UserCrawler.java new file mode 100644 index 0000000..2e8b681 
--- /dev/null
+++ b/src/main/java/com/github/adriens/imgflip/sdk/imgflip/sdk/UserCrawler.java
@@ -0,0 +1,227 @@
+/*
+ * To change this license header, choose License Headers in Project Properties.
+ * To change this template file, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package com.github.adriens.imgflip.sdk.imgflip.sdk;
+
+import com.gargoylesoftware.htmlunit.WebClient;
+import com.gargoylesoftware.htmlunit.html.DomElement;
+import com.gargoylesoftware.htmlunit.html.Html;
+import com.gargoylesoftware.htmlunit.html.HtmlElement;
+import com.gargoylesoftware.htmlunit.html.HtmlPage;
+import com.github.adriens.imgflip.sdk.imgflip.sdk.base.Crawler;
+import com.github.adriens.imgflip.sdk.imgflip.sdk.domain.Stream;
+import com.github.adriens.imgflip.sdk.imgflip.sdk.domain.Submission;
+import com.github.adriens.imgflip.sdk.imgflip.sdk.domain.Template;
+import com.github.adriens.imgflip.sdk.imgflip.sdk.domain.User;
+import com.github.adriens.imgflip.sdk.imgflip.sdk.domain.enums.RankIcon;
+import org.apache.commons.lang3.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.math.BigDecimal;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Date;
+import java.util.List;
+
+/**
+ * Crawls an ImgFlip user page ({@code IMGFLIP_ROOT_URL + USER_URL + "/" + userName})
+ * and maps the elements found on it to a {@link User}.
+ *
+ * <p>Every extraction helper returns {@code null} when the element it looks for is
+ * absent or its text cannot be parsed, so a partially populated {@link User} is
+ * returned rather than the whole crawl failing.
+ */
+public class UserCrawler extends Crawler {
+
+    static final Logger logger = LoggerFactory.getLogger(UserCrawler.class);
+
+    /** Base URL of the user pages: ImgFlip root URL followed by the user path. */
+    public static final String URL = ImgFlipURLHelper.IMGFLIP_ROOT_URL.concat(ImgFlipURLHelper.USER_URL);
+
+    /**
+     * Builds the URL of a user's page.
+     *
+     * @param userName the ImgFlip user name
+     * @return {@link #URL} and {@code userName} joined with {@code "/"}
+     */
+    public static String getUserUrl(String userName) {
+        final String userUrl = StringUtils.join(Arrays.asList(URL, userName), "/");
+
+        logger.debug("url for user {} : {}", userName, userUrl);
+        // Return the value that was just logged instead of joining a second time.
+        return userUrl;
+    }
+
+    /**
+     * Fetches the page of the given user and extracts its details.
+     *
+     * @param userName the ImgFlip user name
+     * @return the populated user
+     * @throws IOException if the page cannot be retrieved
+     */
+    public static User getUser(String userName) throws IOException {
+        return getUser(URL, userName);
+    }
+
+    /**
+     * Fetches {@code url/userName} and maps the page content to a {@link User}.
+     *
+     * @param url      base URL of the user pages
+     * @param userName the ImgFlip user name
+     * @return the populated user; fields whose source element is missing stay {@code null}
+     * @throws IOException if the page cannot be retrieved
+     */
+    private static User getUser(String url, String userName) throws IOException {
+        User user = new User();
+
+        // The client is closed once the page has been read so it does not leak.
+        // NOTE(review): assumes buildWebClient() creates a new client per call - confirm in Crawler.
+        try (WebClient webClient = buildWebClient()) {
+            HtmlPage page = webClient.getPage(String.format("%s/%s", url, userName));
+
+            // general
+            DomElement titleElements = page.getElementById("user-title");
+            user.setRankIcon(getRankIcon(titleElements));
+            user.setUserName(getUsername(titleElements));
+            user.setPoints(getPoints(titleElements));
+
+            // joined date
+            // getElementById yields null for a missing id (getHtmlElementById throws instead),
+            // which lets the null checks in the helpers below take effect.
+            DomElement joinElement = page.getElementById("user-joined");
+            user.setJoinDate(getJoinedDate(joinElement));
+
+            // tag line
+            DomElement tagLineElement = page.getElementById("user-tagline");
+            user.setTagLine(getTagLine(tagLineElement));
+
+            // stats
+            DomElement statsElements = page.getFirstByXPath(".//div[@class='user-stats']");
+            setStats(statsElements, user);
+
+            String mediasPath = ".//div[@class='user-imgs-wrap'][%s]/div[@class='user-imgs']/a";
+
+            // submissions & templates
+            List<HtmlElement> submissionsElements = page.getByXPath(String.format(mediasPath, 1));
+            user.setLatestSubmissions(getSubmissions(submissionsElements));
+
+            List<HtmlElement> templatesElements = page.getByXPath(String.format(mediasPath, 2));
+            user.setTopUploadedTemplates(getTemplates(templatesElements));
+
+            // streams moderated & followed
+            List<HtmlElement> streamsModeratedElements = page.getByXPath("//*[@id='user-streams']/div");
+            user.setStreamsModerated(getStreams(streamsModeratedElements));
+
+            List<HtmlElement> streamsFollowedElements = page.getByXPath("//*[@id='user-streams-followed']/div");
+            user.setStreamsFollowed(getStreams(streamsFollowedElements));
+        }
+
+        logger.debug("{}", user);
+        return user;
+    }
+
+    /**
+     * Resolves the rank icon from the first {@code div} below the given element: its
+     * {@code class} attribute, with {@code "ico "} removed, is passed to
+     * {@link RankIcon#urlForId}.
+     *
+     * @param dom the user title element, may be {@code null}
+     * @return the rank icon, or {@code null} if the element or its {@code div} is missing
+     */
+    private static RankIcon getRankIcon(DomElement dom) {
+        if (dom == null) {
+            return null;
+        }
+
+        HtmlElement rankIconElement = dom.getFirstByXPath(".//div");
+        if (rankIconElement != null) {
+            String cleanId = rankIconElement.getAttribute("class").replace("ico ", "");
+            return RankIcon.urlForId(cleanId);
+        }
+
+        return null;
+    }
+
+    /**
+     * Reads the text of the {@code span} with class {@code u-username}.
+     *
+     * @param dom the user title element, may be {@code null}
+     * @return the user name, or {@code null} if it cannot be found
+     */
+    private static String getUsername(DomElement dom) {
+        if (dom == null) {
+            return null;
+        }
+
+        HtmlElement userNameElement = dom.getFirstByXPath(".//span[@class='u-username']");
+        if (userNameElement != null) {
+            return userNameElement.getTextContent();
+        }
+
+        return null;
+    }
+
+    /**
+     * Reads the text of the {@code span} with id {@code user-points} and parses it as a number
+     * after removing the surrounding parentheses.
+     *
+     * @param dom the user title element, may be {@code null}
+     * @return the points, or {@code null} if the element is missing or its text is not numeric
+     */
+    private static BigDecimal getPoints(DomElement dom) {
+        if (dom == null) {
+            return null;
+        }
+
+        HtmlElement pointsElement = dom.getFirstByXPath(".//span[@id='user-points']");
+        if (pointsElement == null) {
+            return null;
+        }
+
+        // BigDecimal rejects whitespace and grouping separators, so drop them defensively.
+        String cleanPoints = pointsElement.getTextContent()
+                .replace("(", "")
+                .replace(")", "")
+                .replace(",", "")
+                .trim();
+        try {
+            return new BigDecimal(cleanPoints);
+        } catch (NumberFormatException e) {
+            logger.error("Error parsing points : {}", cleanPoints, e);
+            return null;
+        }
+    }
+
+    /**
+     * Parses the join date from the element text: {@code "Joined "} is removed and the
+     * remainder is read with the pattern {@code yyyy-MM-dd}.
+     *
+     * @param html the joined-date element, may be {@code null}
+     * @return the join date, or {@code null} if the element is missing or the date is unparsable
+     */
+    private static Date getJoinedDate(DomElement html) {
+        if (html != null) {
+            String cleanDate = html.getTextContent().replace("Joined ", "").trim();
+            try {
+                return new SimpleDateFormat("yyyy-MM-dd").parse(cleanDate);
+            } catch (ParseException e) {
+                // Pass the exception as last argument so the cause is kept in the log.
+                logger.error("Error parsing joinedDate : {}", cleanDate, e);
+            }
+        }
+
+        return null;
+    }
+
+    /**
+     * Returns the text content of the tag line element.
+     *
+     * @param html the tag line element, may be {@code null}
+     * @return the tag line, or {@code null} if the element is missing
+     */
+    private static String getTagLine(DomElement html) {
+        if (html != null) {
+            return html.getTextContent();
+        }
+
+        return null;
+    }
+
+    /**
+     * Fills the four counters of the user from the stats element.
+     *
+     * @param dom  the stats container, may be {@code null}
+     * @param user the user to update
+     */
+    private static void setStats(DomElement dom, User user) {
+
+        // get sub-elements by index position
+        user.setFeaturedImages(getStat(dom, 1, " Featured Images"));
+        user.setCreations(getStat(dom, 2, " Creations"));
+        user.setComments(getStat(dom, 3, " Comments"));
+        user.setFollowers(getStat(dom, 4, " Followers"));
+    }
+
+    /**
+     * Reads the {@code position}-th {@code div} with class {@code user-stat} and parses its
+     * text as an integer once {@code removeText} has been removed.
+     *
+     * @param dom        the stats container, may be {@code null}
+     * @param position   1-based XPath index of the stat
+     * @param removeText label to strip from the element text
+     * @return the value, or {@code null} if the element is missing or its text is not numeric
+     */
+    private static Integer getStat(DomElement dom, Integer position, String removeText) {
+        if (dom == null) {
+            return null;
+        }
+
+        final String path = String.format(".//div[@class='user-stat'][%s]", position);
+
+        HtmlElement html = dom.getFirstByXPath(path);
+        if (html == null) {
+            return null;
+        }
+
+        // Integer.valueOf rejects whitespace and grouping separators, so drop them defensively.
+        String content = html.getTextContent()
+                .replace(removeText, "")
+                .replace(",", "")
+                .trim();
+        try {
+            return Integer.valueOf(content);
+        } catch (NumberFormatException e) {
+            logger.error("Error parsing stat at position {} : {}", position, content, e);
+            return null;
+        }
+    }
+
+    /**
+     * Converts the submission anchors to {@link Submission} objects. The thumbnail is taken
+     * from the {@code src} of the first {@code img} below the anchor (when present) and the
+     * URL from the anchor's {@code href}, prefixed with the ImgFlip root URL.
+     *
+     * @param htmls the anchor elements
+     * @return one submission per anchor, in the same order
+     */
+    private static List<Submission> getSubmissions(List<HtmlElement> htmls) {
+        List<Submission> submissions = new ArrayList<>();
+
+        for (HtmlElement html : htmls) {
+            Submission submission = new Submission();
+
+            HtmlElement img = html.getFirstByXPath(".//img");
+            if (img != null) {
+                submission.setThumbnail(ImgFlipURLHelper.addHttpsProtocolIfMissing(img.getAttribute("src")));
+            }
+
+            submission.setUrl(ImgFlipURLHelper.IMGFLIP_ROOT_URL.concat(html.getAttribute("href")));
+            submissions.add(submission);
+        }
+
+        return submissions;
+    }
+
+    private static List