From cb98d9a658e30b00a87a7f23dc8971f39e2048db Mon Sep 17 00:00:00 2001 From: pidoubleyou Date: Mon, 19 Feb 2024 22:07:38 +0100 Subject: [PATCH] #965 filter ard topics --- .../parser/ZdfTopicsPageHtmlDeserializer.java | 35 +++++++--- src/test/resources/zdf/zdf_topics_page1.html | 70 +++++++++++++++++++ 2 files changed, 94 insertions(+), 11 deletions(-) diff --git a/src/main/java/de/mediathekview/mserver/crawler/zdf/parser/ZdfTopicsPageHtmlDeserializer.java b/src/main/java/de/mediathekview/mserver/crawler/zdf/parser/ZdfTopicsPageHtmlDeserializer.java index f5f674f4c..12fee71d5 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/zdf/parser/ZdfTopicsPageHtmlDeserializer.java +++ b/src/main/java/de/mediathekview/mserver/crawler/zdf/parser/ZdfTopicsPageHtmlDeserializer.java @@ -1,31 +1,44 @@ package de.mediathekview.mserver.crawler.zdf.parser; +import static de.mediathekview.mserver.base.HtmlConsts.ATTRIBUTE_HREF; + import de.mediathekview.mserver.base.utils.UrlUtils; import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO; import de.mediathekview.mserver.crawler.zdf.ZdfConstants; -import org.jsoup.nodes.Document; -import org.jsoup.select.Elements; - import java.util.HashSet; import java.util.Set; - -import static de.mediathekview.mserver.base.HtmlConsts.ATTRIBUTE_HREF; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; public class ZdfTopicsPageHtmlDeserializer { - private static final String LINK_SELECTOR = "article h3 a"; + private static final String ARTICLE_SELECTOR = "article"; + private static final String LINK_SELECTOR = "h3 a"; + private static final String TEASER_SELECTOR = "dd.teaser-info span"; public Set deserialize(final Document document) { final Set results = new HashSet<>(); - Elements filmUrls = document.select(LINK_SELECTOR); + Elements filmUrls = document.select(ARTICLE_SELECTOR); filmUrls.forEach( - filmUrlElement -> { - String url = filmUrlElement.attr(ATTRIBUTE_HREF); - url = UrlUtils.addDomainIfMissing(url, ZdfConstants.URL_BASE); - results.add(new CrawlerUrlDTO(url)); + articleElement -> { + final Element filmUrlElement = articleElement.selectFirst(LINK_SELECTOR); + final Element teaserElement = articleElement.selectFirst(TEASER_SELECTOR); + if (filmUrlElement != null && isRelevant(teaserElement)) { + String url = filmUrlElement.attr(ATTRIBUTE_HREF); + url = UrlUtils.addDomainIfMissing(url, ZdfConstants.URL_BASE); + results.add(new CrawlerUrlDTO(url)); + } }); return results; } + + private boolean isRelevant(Element teaserElement) { + if (teaserElement == null) { + return true; + } + return !("ARD".equalsIgnoreCase(teaserElement.text())); + } } diff --git a/src/test/resources/zdf/zdf_topics_page1.html b/src/test/resources/zdf/zdf_topics_page1.html index f9e13dabb..1381b3f70 100644 --- a/src/test/resources/zdf/zdf_topics_page1.html +++ b/src/test/resources/zdf/zdf_topics_page1.html @@ -900,8 +900,78 @@

+
+
+
+ + + + + All You Need + +
+
+
+ +

+ + + + All You Need  + + +

+ + + + + + + + + + +
+ +
+ +
+
+
+ +
+ +