Skip to content

Commit

Permalink
Updates to web scraping
Browse files Browse the repository at this point in the history
  • Loading branch information
hlysine committed Feb 25, 2022
1 parent 792bb5d commit f7bb0e5
Show file tree
Hide file tree
Showing 7 changed files with 18 additions and 17 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,15 @@ public override async Task<bool> ValidateQueryAsync(string url, string word)

public override async Task<string> GetWordAsync(ChromiumWebBrowser browser)
{
var headword = await browser.GetInnerTextByXPath(@"//span[contains(@class,""headword"")]");
var headword = await browser.GetInnerTextByXPath(@"//span[contains(@class,'headword')]");
if (!string.IsNullOrWhiteSpace(headword))
return headword;
headword = Regex.Match(browser.Address, @"dictionary\.cambridge\.org\/dictionary\/[\w-_]+\/([\w_-]+)").Groups[1].Value;
return headword;
}

public override async Task<string> GetDescriptionAsync(ChromiumWebBrowser browser)
=> await browser.GetInnerTextByXPath(@"//div[contains(@class,""def ddef_d"")]", @"//span[contains(@class,""trans dtrans"")]");
=> await browser.GetInnerTextByXPath(@"//div[contains(@class,'def ddef_d')]", @"//span[contains(@class,'trans dtrans')]");

public override PackIconKind Icon => PackIconKind.LetterCBox;

Expand Down
6 changes: 3 additions & 3 deletions QuickDictionary/Models/Dictionaries/DictionaryDotCom.cs
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@ public override async Task<bool> ValidateQueryAsync(string url, string word)

public override async Task<string> GetWordAsync(ChromiumWebBrowser browser)
{
var headword = await browser.GetInnerTextByXPath(@"//h1[@class=""css-1jzk4d9 e1rg2mtf8""]");
var headword = await browser.GetInnerTextByXPath(@"//*[@data-first-headword='true']");
if (!string.IsNullOrWhiteSpace(headword))
return headword;
var match = Regex.Match(browser.Address, @"www\.dictionary\.com\/definition\/[\w-_]+\/([^?]+)");
var match = Regex.Match(browser.Address, @"www\.dictionary\.com\/browse\/([^?]+)");
if (match.Success)
{
headword = WebUtility.UrlDecode(match.Groups[1].Value);
Expand All @@ -34,7 +34,7 @@ public override async Task<string> GetWordAsync(ChromiumWebBrowser browser)
}

public override async Task<string> GetDescriptionAsync(ChromiumWebBrowser browser)
=> await browser.GetInnerTextByXPath(@"//div[@class=""css-1ghs5zt e1q3nk1v3""]");
=> await browser.GetInnerTextByXPath(@"//*[contains(@class, 'one-click-content')]");

public override PackIconKind Icon => PackIconKind.LetterDBox;

Expand Down
9 changes: 5 additions & 4 deletions QuickDictionary/Models/Dictionaries/GoogleSearchDictionary.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
using System.Threading.Tasks;
using CefSharp.Wpf;
using MaterialDesignThemes.Wpf;
using QuickDictionary.Utils;

namespace QuickDictionary.Models.Dictionaries;

Expand All @@ -15,11 +16,11 @@ public override bool ValidateUrl(string url)
public override Task<bool> ValidateQueryAsync(string url, string word) => Task.FromResult(true);

// todo: try web-scraping google search
public override Task<string> GetWordAsync(ChromiumWebBrowser browser)
=> Task.FromResult<string>(null);
public override async Task<string> GetWordAsync(ChromiumWebBrowser browser)
=> await browser.GetInnerTextByXPath(@"//*[@data-dobid='hdw']");

public override Task<string> GetDescriptionAsync(ChromiumWebBrowser browser)
=> Task.FromResult<string>(null);
public override async Task<string> GetDescriptionAsync(ChromiumWebBrowser browser)
=> await browser.GetInnerTextByXPath(@"//*[@data-dobid='dfn']");

public override PackIconKind Icon => PackIconKind.Google;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ public override Task<string> GetWordAsync(ChromiumWebBrowser browser)
=> Task.FromResult<string>(null);

public override async Task<string> GetDescriptionAsync(ChromiumWebBrowser browser)
=> await browser.GetInnerTextByXPath(@"//div[contains(@class,""J0lOec"")]");
=> await browser.GetInnerTextByXPath(@"//div[contains(@class,'J0lOec')]");

public override PackIconKind Icon => PackIconKind.GoogleTranslate;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,15 +33,15 @@ public override async Task<bool> ValidateQueryAsync(string url, string word)

public override async Task<string> GetWordAsync(ChromiumWebBrowser browser)
{
var headword = await browser.GetInnerTextByXPath(@"//h1[contains(@class,""hword"")]");
var headword = await browser.GetInnerTextByXPath(@"//h1[contains(@class,'hword')]");
if (!string.IsNullOrWhiteSpace(headword))
return headword;
headword = Regex.Match(browser.Address, @"www\.merriam-webster\.com\/[\w-_]+\/([\w_-]+)").Groups[1].Value;
return headword;
}

public override async Task<string> GetDescriptionAsync(ChromiumWebBrowser browser)
=> await browser.GetInnerTextByXPath(@"//span[@class=""dtText""]");
=> await browser.GetInnerTextByXPath(@"//span[@class='dtText']");

public override PackIconKind Icon => PackIconKind.MedicalBag;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,18 +19,18 @@ public override async Task<bool> ValidateQueryAsync(string url, string word)

public override async Task<string> GetWordAsync(ChromiumWebBrowser browser)
{
var headword = await browser.GetInnerTextByXPath(@"//h1[@class=""headword""]");
var headword = await browser.GetInnerTextByXPath(@"//h1[@class='headword']");
if (!string.IsNullOrWhiteSpace(headword))
return headword;
headword = await browser.GetInnerTextByXPath(@"//h2[@class=""h""]");
headword = await browser.GetInnerTextByXPath(@"//h2[@class='h']");
if (!string.IsNullOrWhiteSpace(headword))
return headword;
headword = Regex.Match(browser.Address, @"www\.oxfordlearnersdictionaries\.com\/definition\/[\w-_]+\/([\w_-]+)").Groups[1].Value;
return headword;
}

public override async Task<string> GetDescriptionAsync(ChromiumWebBrowser browser)
=> await browser.GetInnerTextByXPath(@"//span[@class=""def""]");
=> await browser.GetInnerTextByXPath(@"//span[@class='def']");

public override PackIconKind Icon => PackIconKind.LetterOBox;

Expand Down
4 changes: 2 additions & 2 deletions QuickDictionary/Models/Dictionaries/WikipediaDictionary.cs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ public override async Task<bool> ValidateQueryAsync(string url, string word)

public override async Task<string> GetWordAsync(ChromiumWebBrowser browser)
{
var headword = await browser.GetInnerTextByXPath(@"//div[@class=""page-heading""]");
var headword = await browser.GetInnerTextByXPath(@"//div[@class='page-heading']");
if (!string.IsNullOrWhiteSpace(headword))
return headword;
var match = Regex.Match(browser.Address, @"wikipedia\.org\/w\/index\.php\?title=([^&]+)");
Expand All @@ -42,7 +42,7 @@ public override async Task<string> GetWordAsync(ChromiumWebBrowser browser)

public override async Task<string> GetDescriptionAsync(ChromiumWebBrowser browser)
{
var res = await browser.GetInnerTextByXPath(@"(//div[@id=""bodyContent""]//p[not(@class)])[1]");
var res = await browser.GetInnerTextByXPath(@"(//div[@id='bodyContent']//p[not(@class)])[1]");
if (res == null)
return null;
return Regex.Replace(res, @"\[\d+\]", "");
Expand Down

0 comments on commit f7bb0e5

Please sign in to comment.