Skip to content

Commit

Permalink
Merge pull request karussell#36 from skyshard/abhishek/CRAWL-163_CRAWL-170
Browse files Browse the repository at this point in the history

Fixed extraction issues
  • Loading branch information
andresp99999 authored Jun 5, 2017
2 parents 37b5d15 + d93b115 commit 601ded6
Show file tree
Hide file tree
Showing 4 changed files with 2,675 additions and 1 deletion.
5 changes: 4 additions & 1 deletion src/main/java/de/jetwick/snacktory/ArticleTextExtractor.java
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,9 @@ public boolean hasHTMLTags(String text){
aMap.put("sltrib.com", Arrays.asList(
"#main-content > div.row"
));
aMap.put("sfchronicle.com", Arrays.asList(
"div[class=article-text]"
));

BEST_ELEMENT_PER_DOMAIN = Collections.unmodifiableMap(aMap);
}
Expand Down Expand Up @@ -293,7 +296,7 @@ public ArticleTextExtractor() {
+ "login|si(debar|gn|ngle)");
setPositive("(^(body|content|h?entry|main|page|post|text|blog|story|haupt))"
+ "|arti(cle|kel)|instapaper_body|storybody|short-story|storycontent|articletext|story-primary|^newsContent$|dcontainer|announcement-details");
setHighlyPositive("news-release-detail|storybody|main-content|articlebody|article_body|article-body|html-view-content|entry__body|^main-article$|^article__content$|^articleContent$|^mainEntityOfPage$|art_body_article|^article_text$|main-article-chapter|post-body");
setHighlyPositive("news-detail-content|news-release-detail|storybody|main-content|articlebody|article_body|article-body|html-view-content|entry__body|^main-article$|^article__content$|^articleContent$|^mainEntityOfPage$|art_body_article|^article_text$|main-article-chapter|post-body");
setNegative("nav($|igation)|user|com(ment|bx)|(^com-)|contact|"
+ "foot|masthead|(me(dia|ta))|outbrain|promo|related|scroll|(sho(utbox|pping))|"
+ "sidebar|sponsor|tags|tool|widget|player|disclaimer|toc|infobox|vcard|title|truncate|slider|^sectioncolumns$|ad-container");
Expand Down
30 changes: 30 additions & 0 deletions src/test/java/de/jetwick/snacktory/ArticleTextExtractorTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -3080,6 +3080,36 @@ public void testPublicNet() throws Exception {
compareDates("2017-05-12 00:00:00", res.getDate());
}

/**
 * Runs the extractor over the saved {@code morningstar.html} fixture and
 * checks the canonical URL, title, first and last words of the body text,
 * the (empty) author fields and the publication date.
 */
@Test
public void testMorningStar() throws Exception {
    // Source page; also the canonical URL the extractor is expected to report.
    final String articleUrl = "http://www.morningstar.com/news/associated-press/urn:publicid:ap.org:f8d53c4370434744a864d4afa5fa8d36/hackers-break-into-centralized-password-manager-onelogin.html";

    JResult article = new JResult();
    article.setUrl(articleUrl);
    final String html = c.streamToString(getClass().getResourceAsStream("morningstar.html"));
    article = extractor.extractContent(article, html);

    assertEquals(articleUrl, article.getCanonicalUrl());
    assertEquals("Hackers break into centralized password manager OneLogin", article.getTitle());

    // The extracted text doubles as the failure message so a mismatch shows what was produced.
    final String body = article.getText();
    assertTrue(body, body.startsWith("NEW YORK (AP) — Hackers have gained access to OneLogin,"));
    assertTrue(body, body.endsWith("although not actual passwords."));

    // No author information is expected for this article.
    assertEquals(StringUtils.EMPTY, article.getAuthorName());
    assertEquals(StringUtils.EMPTY, article.getAuthorDescription());

    compareDates("2017-06-02 00:00:00", article.getDate());
}

/**
 * Runs the extractor over the saved {@code sfchronicle.html} fixture and
 * checks the canonical URL, title, first and last words of the body text,
 * the author name and description, and the publication date.
 */
@Test
public void testSfchronicle() throws Exception {
    // Source page; also the canonical URL the extractor is expected to report.
    final String articleUrl = "http://www.sfchronicle.com/business/article/Odd-jobs-matchmaker-Thumbtack-gets-big-funds-6541824.php";
    // The author blurb is expected both as the tail of the body text and as the author description.
    final String authorBlurb = "Carolyn Said is a San Francisco Chronicle staff writer. E-mail: csaid@sfchronicle.com Twitter: @csaid";

    JResult article = new JResult();
    article.setUrl(articleUrl);
    final String html = c.streamToString(getClass().getResourceAsStream("sfchronicle.html"));
    article = extractor.extractContent(article, html);

    assertEquals(articleUrl, article.getCanonicalUrl());
    assertEquals("Odd-jobs matchmaker Thumbtack gets big funds, joins unicorn club - San Francisco Chronicle", article.getTitle());

    // The extracted text doubles as the failure message so a mismatch shows what was produced.
    final String body = article.getText();
    assertTrue(body, body.startsWith("San Francisco’s Thumbtack, which introduces fix-it folks,"));
    assertTrue(body, body.endsWith(authorBlurb));

    assertEquals("Carolyn Said", article.getAuthorName());
    assertEquals(authorBlurb, article.getAuthorDescription());

    compareDates("2015-09-30 00:00:00", article.getDate());
}

public static void compareDates(String expectedDateString, Date actual) {
String[] patterns = {
"yyyy-MM-dd",
Expand Down
Loading

0 comments on commit 601ded6

Please sign in to comment.