From f7bb0e50d8cf54b5ba6ddd46d7bb41013331948b Mon Sep 17 00:00:00 2001 From: Henry Lin Date: Fri, 25 Feb 2022 14:37:36 +0800 Subject: [PATCH] Updates to web scraping --- .../Dictionaries/CambridgeEnglishChineseDictionary.cs | 4 ++-- QuickDictionary/Models/Dictionaries/DictionaryDotCom.cs | 6 +++--- .../Models/Dictionaries/GoogleSearchDictionary.cs | 9 +++++---- .../Models/Dictionaries/GoogleTranslateDictionary.cs | 2 +- .../Dictionaries/MerriamWebsterMedicalDictionary.cs | 4 ++-- .../Dictionaries/OxfordAdvancedLearnersDictionary.cs | 6 +++--- .../Models/Dictionaries/WikipediaDictionary.cs | 4 ++-- 7 files changed, 18 insertions(+), 17 deletions(-) diff --git a/QuickDictionary/Models/Dictionaries/CambridgeEnglishChineseDictionary.cs b/QuickDictionary/Models/Dictionaries/CambridgeEnglishChineseDictionary.cs index f2cc99a..93f94ee 100644 --- a/QuickDictionary/Models/Dictionaries/CambridgeEnglishChineseDictionary.cs +++ b/QuickDictionary/Models/Dictionaries/CambridgeEnglishChineseDictionary.cs @@ -19,7 +19,7 @@ public override async Task ValidateQueryAsync(string url, string word) public override async Task GetWordAsync(ChromiumWebBrowser browser) { - var headword = await browser.GetInnerTextByXPath(@"//span[contains(@class,""headword"")]"); + var headword = await browser.GetInnerTextByXPath(@"//span[contains(@class,'headword')]"); if (!string.IsNullOrWhiteSpace(headword)) return headword; headword = Regex.Match(browser.Address, @"dictionary\.cambridge\.org\/dictionary\/[\w-_]+\/([\w_-]+)").Groups[1].Value; @@ -27,7 +27,7 @@ public override async Task GetWordAsync(ChromiumWebBrowser browser) } public override async Task GetDescriptionAsync(ChromiumWebBrowser browser) - => await browser.GetInnerTextByXPath(@"//div[contains(@class,""def ddef_d"")]", @"//span[contains(@class,""trans dtrans"")]"); + => await browser.GetInnerTextByXPath(@"//div[contains(@class,'def ddef_d')]", @"//span[contains(@class,'trans dtrans')]"); public override PackIconKind Icon => PackIconKind.LetterCBox; diff --git a/QuickDictionary/Models/Dictionaries/DictionaryDotCom.cs b/QuickDictionary/Models/Dictionaries/DictionaryDotCom.cs index ebbdf48..8cd00a7 100644 --- a/QuickDictionary/Models/Dictionaries/DictionaryDotCom.cs +++ b/QuickDictionary/Models/Dictionaries/DictionaryDotCom.cs @@ -20,10 +20,10 @@ public override async Task ValidateQueryAsync(string url, string word) public override async Task GetWordAsync(ChromiumWebBrowser browser) { - var headword = await browser.GetInnerTextByXPath(@"//h1[@class=""css-1jzk4d9 e1rg2mtf8""]"); + var headword = await browser.GetInnerTextByXPath(@"//*[@data-first-headword='true']"); if (!string.IsNullOrWhiteSpace(headword)) return headword; - var match = Regex.Match(browser.Address, @"www\.dictionary\.com\/definition\/[\w-_]+\/([^?]+)"); + var match = Regex.Match(browser.Address, @"www\.dictionary\.com\/browse\/([^?]+)"); if (match.Success) { headword = WebUtility.UrlDecode(match.Groups[1].Value); @@ -34,7 +34,7 @@ public override async Task GetWordAsync(ChromiumWebBrowser browser) } public override async Task GetDescriptionAsync(ChromiumWebBrowser browser) - => await browser.GetInnerTextByXPath(@"//div[@class=""css-1ghs5zt e1q3nk1v3""]"); + => await browser.GetInnerTextByXPath(@"//*[contains(@class, 'one-click-content')]"); public override PackIconKind Icon => PackIconKind.LetterDBox; diff --git a/QuickDictionary/Models/Dictionaries/GoogleSearchDictionary.cs b/QuickDictionary/Models/Dictionaries/GoogleSearchDictionary.cs index 8d4f1d0..be96052 100644 --- a/QuickDictionary/Models/Dictionaries/GoogleSearchDictionary.cs +++ b/QuickDictionary/Models/Dictionaries/GoogleSearchDictionary.cs @@ -2,6 +2,7 @@ using System.Threading.Tasks; using CefSharp.Wpf; using MaterialDesignThemes.Wpf; +using QuickDictionary.Utils; namespace QuickDictionary.Models.Dictionaries; @@ -15,11 +16,11 @@ public override bool ValidateUrl(string url) public override Task ValidateQueryAsync(string url, string word) => Task.FromResult(true); // todo: try web-scraping google search - public override Task GetWordAsync(ChromiumWebBrowser browser) - => Task.FromResult(null); + public override async Task GetWordAsync(ChromiumWebBrowser browser) + => await browser.GetInnerTextByXPath(@"//*[@data-dobid='hdw']"); - public override Task GetDescriptionAsync(ChromiumWebBrowser browser) - => Task.FromResult(null); + public override async Task GetDescriptionAsync(ChromiumWebBrowser browser) + => await browser.GetInnerTextByXPath(@"//*[@data-dobid='dfn']"); public override PackIconKind Icon => PackIconKind.Google; diff --git a/QuickDictionary/Models/Dictionaries/GoogleTranslateDictionary.cs b/QuickDictionary/Models/Dictionaries/GoogleTranslateDictionary.cs index f7eca9c..de56423 100644 --- a/QuickDictionary/Models/Dictionaries/GoogleTranslateDictionary.cs +++ b/QuickDictionary/Models/Dictionaries/GoogleTranslateDictionary.cs @@ -20,7 +20,7 @@ public override Task GetWordAsync(ChromiumWebBrowser browser) => Task.FromResult(null); public override async Task GetDescriptionAsync(ChromiumWebBrowser browser) - => await browser.GetInnerTextByXPath(@"//div[contains(@class,""J0lOec"")]"); + => await browser.GetInnerTextByXPath(@"//div[contains(@class,'J0lOec')]"); public override PackIconKind Icon => PackIconKind.GoogleTranslate; diff --git a/QuickDictionary/Models/Dictionaries/MerriamWebsterMedicalDictionary.cs b/QuickDictionary/Models/Dictionaries/MerriamWebsterMedicalDictionary.cs index b0ceeec..2b27c63 100644 --- a/QuickDictionary/Models/Dictionaries/MerriamWebsterMedicalDictionary.cs +++ b/QuickDictionary/Models/Dictionaries/MerriamWebsterMedicalDictionary.cs @@ -33,7 +33,7 @@ public override async Task ValidateQueryAsync(string url, string word) public override async Task GetWordAsync(ChromiumWebBrowser browser) { - var headword = await browser.GetInnerTextByXPath(@"//h1[contains(@class,""hword"")]"); + var headword = await browser.GetInnerTextByXPath(@"//h1[contains(@class,'hword')]"); if (!string.IsNullOrWhiteSpace(headword)) return headword; headword = Regex.Match(browser.Address, @"www\.merriam-webster\.com\/[\w-_]+\/([\w_-]+)").Groups[1].Value; @@ -41,7 +41,7 @@ public override async Task GetWordAsync(ChromiumWebBrowser browser) } public override async Task GetDescriptionAsync(ChromiumWebBrowser browser) - => await browser.GetInnerTextByXPath(@"//span[@class=""dtText""]"); + => await browser.GetInnerTextByXPath(@"//span[@class='dtText']"); public override PackIconKind Icon => PackIconKind.MedicalBag; diff --git a/QuickDictionary/Models/Dictionaries/OxfordAdvancedLearnersDictionary.cs b/QuickDictionary/Models/Dictionaries/OxfordAdvancedLearnersDictionary.cs index 240cd9f..2229981 100644 --- a/QuickDictionary/Models/Dictionaries/OxfordAdvancedLearnersDictionary.cs +++ b/QuickDictionary/Models/Dictionaries/OxfordAdvancedLearnersDictionary.cs @@ -19,10 +19,10 @@ public override async Task ValidateQueryAsync(string url, string word) public override async Task GetWordAsync(ChromiumWebBrowser browser) { - var headword = await browser.GetInnerTextByXPath(@"//h1[@class=""headword""]"); + var headword = await browser.GetInnerTextByXPath(@"//h1[@class='headword']"); if (!string.IsNullOrWhiteSpace(headword)) return headword; - headword = await browser.GetInnerTextByXPath(@"//h2[@class=""h""]"); + headword = await browser.GetInnerTextByXPath(@"//h2[@class='h']"); if (!string.IsNullOrWhiteSpace(headword)) return headword; headword = Regex.Match(browser.Address, @"www\.oxfordlearnersdictionaries\.com\/definition\/[\w-_]+\/([\w_-]+)").Groups[1].Value; @@ -30,7 +30,7 @@ public override async Task GetWordAsync(ChromiumWebBrowser browser) } public override async Task GetDescriptionAsync(ChromiumWebBrowser browser) - => await browser.GetInnerTextByXPath(@"//span[@class=""def""]"); + => await browser.GetInnerTextByXPath(@"//span[@class='def']"); public override PackIconKind Icon => PackIconKind.LetterOBox; diff --git a/QuickDictionary/Models/Dictionaries/WikipediaDictionary.cs b/QuickDictionary/Models/Dictionaries/WikipediaDictionary.cs index 734f194..01e4912 100644 --- a/QuickDictionary/Models/Dictionaries/WikipediaDictionary.cs +++ b/QuickDictionary/Models/Dictionaries/WikipediaDictionary.cs @@ -20,7 +20,7 @@ public override async Task ValidateQueryAsync(string url, string word) public override async Task GetWordAsync(ChromiumWebBrowser browser) { - var headword = await browser.GetInnerTextByXPath(@"//div[@class=""page-heading""]"); + var headword = await browser.GetInnerTextByXPath(@"//div[@class='page-heading']"); if (!string.IsNullOrWhiteSpace(headword)) return headword; var match = Regex.Match(browser.Address, @"wikipedia\.org\/w\/index\.php\?title=([^&]+)"); @@ -42,7 +42,7 @@ public override async Task GetWordAsync(ChromiumWebBrowser browser) public override async Task GetDescriptionAsync(ChromiumWebBrowser browser) { - var res = await browser.GetInnerTextByXPath(@"(//div[@id=""bodyContent""]//p[not(@class)])[1]"); + var res = await browser.GetInnerTextByXPath(@"(//div[@id='bodyContent']//p[not(@class)])[1]"); if (res == null) return null; return Regex.Replace(res, @"\[\d+\]", "");