From 21e92e28fae980551eb6801189ef059fcc25c15e Mon Sep 17 00:00:00 2001 From: Terry Mancey Date: Sun, 29 May 2022 12:18:18 +0100 Subject: [PATCH] Classify search engine results page content --- components/brave_ads/test/BUILD.gn | 4 + vendor/bat-native-ads/BUILD.gn | 12 ++ .../src/bat/ads/internal/ads_impl.cc | 18 +- .../search_engine_domain_extensions_util.cc | 60 ++++++ .../search_engine_domain_extensions_util.h | 20 ++ .../ads/internal/base/search_engine_info.cc | 14 +- .../ads/internal/base/search_engine_info.h | 14 +- ...ch_engine_results_page_url_pattern_util.cc | 44 ++++ ...rch_engine_results_page_url_pattern_util.h | 21 ++ ..._results_page_url_pattern_util_unittest.cc | 99 +++++++++ .../base/search_engine_results_page_util.cc | 80 +++++++ .../base/search_engine_results_page_util.h | 23 ++ ...earch_engine_results_page_util_unittest.cc | 160 ++++++++++++++ .../base/search_engine_subdomains_util.cc | 36 ++++ .../base/search_engine_subdomains_util.h | 19 ++ .../base/search_engine_url_pattern_util.cc | 51 +++++ .../base/search_engine_url_pattern_util.h | 21 ++ ...search_engine_url_pattern_util_unittest.cc | 94 ++++++++ .../ads/internal/base/search_engine_util.cc | 197 ++--------------- .../ads/internal/base/search_engine_util.h | 4 - .../base/search_engine_util_unittest.cc | 201 ++++++++++++++++++ .../ads/internal/base/search_engines_util.cc | 61 ++++++ .../ads/internal/base/search_engines_util.h | 19 ++ .../src/bat/ads/internal/base/url_util.cc | 7 +- .../src/bat/ads/internal/base/url_util.h | 2 + .../sorts/conversions_ascending_sort.cc | 4 +- .../sorts/conversions_descending_sort.cc | 4 +- .../purchase_intent_processor.cc | 9 +- .../purchase_intent_processor_unittest.cc | 3 +- 29 files changed, 1085 insertions(+), 216 deletions(-) create mode 100644 vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_domain_extensions_util.cc create mode 100644 vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_domain_extensions_util.h create mode 100644 
vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_results_page_url_pattern_util.cc create mode 100644 vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_results_page_url_pattern_util.h create mode 100644 vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_results_page_url_pattern_util_unittest.cc create mode 100644 vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_results_page_util.cc create mode 100644 vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_results_page_util.h create mode 100644 vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_results_page_util_unittest.cc create mode 100644 vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_subdomains_util.cc create mode 100644 vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_subdomains_util.h create mode 100644 vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_url_pattern_util.cc create mode 100644 vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_url_pattern_util.h create mode 100644 vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_url_pattern_util_unittest.cc create mode 100644 vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_util_unittest.cc create mode 100644 vendor/bat-native-ads/src/bat/ads/internal/base/search_engines_util.cc create mode 100644 vendor/bat-native-ads/src/bat/ads/internal/base/search_engines_util.h diff --git a/components/brave_ads/test/BUILD.gn b/components/brave_ads/test/BUILD.gn index bce1166fdcc5..b4f0395c87c7 100644 --- a/components/brave_ads/test/BUILD.gn +++ b/components/brave_ads/test/BUILD.gn @@ -96,6 +96,10 @@ source_set("brave_ads_unit_tests") { "//brave/vendor/bat-native-ads/src/bat/ads/internal/base/number_util_unittest.cc", "//brave/vendor/bat-native-ads/src/bat/ads/internal/base/platform_helper_mock.cc", "//brave/vendor/bat-native-ads/src/bat/ads/internal/base/platform_helper_mock.h", + 
"//brave/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_results_page_url_pattern_util_unittest.cc", + "//brave/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_results_page_util_unittest.cc", + "//brave/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_url_pattern_util_unittest.cc", + "//brave/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_util_unittest.cc", "//brave/vendor/bat-native-ads/src/bat/ads/internal/base/string_util_unittest.cc", "//brave/vendor/bat-native-ads/src/bat/ads/internal/base/subdivision_code_util_unittest.cc", "//brave/vendor/bat-native-ads/src/bat/ads/internal/base/time_constraint_util_unittest.cc", diff --git a/vendor/bat-native-ads/BUILD.gn b/vendor/bat-native-ads/BUILD.gn index 5cb09ba99089..a2f1cd4c0049 100644 --- a/vendor/bat-native-ads/BUILD.gn +++ b/vendor/bat-native-ads/BUILD.gn @@ -350,10 +350,22 @@ source_set("ads") { "src/bat/ads/internal/base/platform_helper.cc", "src/bat/ads/internal/base/platform_helper.h", "src/bat/ads/internal/base/platform_helper_types.h", + "src/bat/ads/internal/base/search_engine_domain_extensions_util.cc", + "src/bat/ads/internal/base/search_engine_domain_extensions_util.h", "src/bat/ads/internal/base/search_engine_info.cc", "src/bat/ads/internal/base/search_engine_info.h", + "src/bat/ads/internal/base/search_engine_results_page_url_pattern_util.cc", + "src/bat/ads/internal/base/search_engine_results_page_url_pattern_util.h", + "src/bat/ads/internal/base/search_engine_results_page_util.cc", + "src/bat/ads/internal/base/search_engine_results_page_util.h", + "src/bat/ads/internal/base/search_engine_subdomains_util.cc", + "src/bat/ads/internal/base/search_engine_subdomains_util.h", + "src/bat/ads/internal/base/search_engine_url_pattern_util.cc", + "src/bat/ads/internal/base/search_engine_url_pattern_util.h", "src/bat/ads/internal/base/search_engine_util.cc", "src/bat/ads/internal/base/search_engine_util.h", + 
"src/bat/ads/internal/base/search_engines_util.cc", + "src/bat/ads/internal/base/search_engines_util.h", "src/bat/ads/internal/base/string_util.cc", "src/bat/ads/internal/base/string_util.h", "src/bat/ads/internal/base/subdivision_code_util.cc", diff --git a/vendor/bat-native-ads/src/bat/ads/internal/ads_impl.cc b/vendor/bat-native-ads/src/bat/ads/internal/ads_impl.cc index 63a7f373b7c7..0c443f9dc314 100644 --- a/vendor/bat-native-ads/src/bat/ads/internal/ads_impl.cc +++ b/vendor/bat-native-ads/src/bat/ads/internal/ads_impl.cc @@ -27,6 +27,7 @@ #include "bat/ads/internal/ads_client_helper.h" #include "bat/ads/internal/base/logging_util.h" #include "bat/ads/internal/base/platform_helper.h" +#include "bat/ads/internal/base/search_engine_results_page_util.h" #include "bat/ads/internal/base/search_engine_util.h" #include "bat/ads/internal/base/string_util.h" #include "bat/ads/internal/base/time_formatting_util.h" @@ -174,12 +175,12 @@ void AdsImpl::OnHtmlLoaded(const int32_t tab_id, const uint32_t hash = base::FastHash(html); if (hash == last_html_loaded_hash_) { - BLOG(1, "HTML content has not changed"); return; } last_html_loaded_hash_ = hash; transfer_->MaybeTransferAd(tab_id, redirect_chain); + conversions_->MaybeConvert( redirect_chain, html, conversions_resource_->get()->conversion_id_patterns); @@ -196,7 +197,6 @@ void AdsImpl::OnTextLoaded(const int32_t tab_id, const uint32_t hash = base::FastHash(text); if (hash == last_text_loaded_hash_) { - BLOG(1, "Text content has not changed"); return; } last_text_loaded_hash_ = hash; @@ -204,7 +204,7 @@ void AdsImpl::OnTextLoaded(const int32_t tab_id, const GURL& url = redirect_chain.back(); if (!url.SchemeIsHTTPOrHTTPS()) { - BLOG(1, "Visited URL is not supported"); + BLOG(1, url.scheme() << " scheme is not supported for text content"); return; } @@ -215,12 +215,14 @@ void AdsImpl::OnTextLoaded(const int32_t tab_id, purchase_intent_processor_->Process(url); } - if (IsSearchEngine(url)) { - BLOG(1, "Search engine pages 
are not supported for text classification"); - } else { - const std::string stripped_text = StripNonAlphaCharacters(text); - text_classification_processor_->Process(stripped_text); + if (IsSearchEngine(url) && !IsSearchEngineResultsPage(url)) { + BLOG(1, + "Search engine landing page is not supported for text classification"); + return; } + + const std::string stripped_text = StripNonAlphaCharacters(text); + text_classification_processor_->Process(stripped_text); } void AdsImpl::OnUserGesture(const int32_t page_transition_type) { diff --git a/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_domain_extensions_util.cc b/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_domain_extensions_util.cc new file mode 100644 index 000000000000..1150cdb1841e --- /dev/null +++ b/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_domain_extensions_util.cc @@ -0,0 +1,60 @@ +/* Copyright (c) 2022 The Brave Authors. All rights reserved. + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this file, + * You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "bat/ads/internal/base/search_engine_domain_extensions_util.h" + +#include "base/no_destructor.h" + +namespace ads { + +const std::vector& GetAmazonSearchEngineDomainExtensions() { + // See https://www.amazon.com/gp/navigation-country/select-country. 
+ static base::NoDestructor> extensions( + {"ae", "ca", "cn", "co.jp", "co.uk", "com", "com.au", + "com.br", "com.mx", "de", "eg", "es", "fr", "in", + "it", "nl", "pl", "sa", "se", "sp", "tr"}); + return *extensions; +} + +const std::vector& GetGoogleSearchEngineDomainExtensions() { + static base::NoDestructor> extensions( + {"ac", "ad", "ae", "al", "am", "as", "at", + "az", "ba", "be", "bf", "bg", "bi", "bj", + "bs", "bt", "ca", "cat", "cd", "cf", "cg", + "ch", "ci", "ci", "cl", "cm", "cn", "co.bw", + "co.ck", "co.cr", "co.id", "co.il", "co.im", "co.in", "co.je", + "co.jp", "co.ke", "co.kr", "co.ls", "co.ma", "co.mz", "co.nz", + "co.th", "co.tz", "co.ug", "co.uk", "co.uz", "co.ve", "co.vi", + "co.za", "co.zm", "co.zw", "com.af", "com.ag", "com.ai", "com.ar", + "com.au", "com.bd", "com.bh", "com.bn", "com.bo", "com.br", "com.by", + "com.bz", "com.co", "com.cu", "com.cy", "com.do", "com.ec", "com.eg", + "com.et", "com.fj", "com.gh", "com.gi", "com.gt", "com.hk", "com.jm", + "com.kg", "com.kh", "com.kw", "com.lb", "com.ly", "com.mt", "com.mx", + "com.my", "com.na", "com.nf", "com.ng", "com.ni", "com.np", "com.om", + "com.pa", "com.pe", "com.pg", "com.ph", "com.pk", "com.pr", "com.py", + "com.qa", "com.sa", "com.sb", "com.sg", "com.sl", "com.sv", "com.tj", + "com.tr", "com.tw", "com.ua", "com.uy", "com.vc", "com.vn", "com", + "cv", "cz", "de", "dj", "dk", "dm", "dz", + "ee", "es", "fi", "fm", "fr", "ga", "ge", + "gg", "gl", "gm", "gp", "gr", "gy", "hn", + "hr", "ht", "hu", "ie", "iq", "is", "it.ao", + "it", "jo", "ki", "kz", "la", "li", "lk", + "lt", "lu", "lv", "md", "me", "mg", "mk", + "ml", "mn", "ms", "mu", "mv", "mw", "ne", + "nl", "no", "nr", "nu", "pl", "pn", "ps", + "pt", "ro", "rs", "ru", "rw", "sc", "se", + "sh", "si", "sk", "sm", "sn", "so", "sr", + "st", "td", "tg", "tk", "tl", "tm", "tn", + "to", "tt", "vg", "vu", "ws"}); + return *extensions; +} + +const std::vector& GetMojeekSearchEngineDomainExtensions() { + static base::NoDestructor> extensions( + 
{"co.uk", "com"}); + return *extensions; +} + +} // namespace ads diff --git a/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_domain_extensions_util.h b/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_domain_extensions_util.h new file mode 100644 index 000000000000..2737efafee4e --- /dev/null +++ b/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_domain_extensions_util.h @@ -0,0 +1,20 @@ +/* Copyright (c) 2022 The Brave Authors. All rights reserved. + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this file, + * You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef BRAVE_VENDOR_BAT_NATIVE_ADS_SRC_BAT_ADS_INTERNAL_BASE_SEARCH_ENGINE_DOMAIN_EXTENSIONS_UTIL_H_ +#define BRAVE_VENDOR_BAT_NATIVE_ADS_SRC_BAT_ADS_INTERNAL_BASE_SEARCH_ENGINE_DOMAIN_EXTENSIONS_UTIL_H_ + +#include +#include + +namespace ads { + +const std::vector& GetAmazonSearchEngineDomainExtensions(); +const std::vector& GetGoogleSearchEngineDomainExtensions(); +const std::vector& GetMojeekSearchEngineDomainExtensions(); + +} // namespace ads + +#endif // BRAVE_VENDOR_BAT_NATIVE_ADS_SRC_BAT_ADS_INTERNAL_BASE_SEARCH_ENGINE_DOMAIN_EXTENSIONS_UTIL_H_ diff --git a/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_info.cc b/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_info.cc index 8820a975968a..328b5b818a20 100644 --- a/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_info.cc +++ b/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_info.cc @@ -9,14 +9,12 @@ namespace ads { SearchEngineInfo::SearchEngineInfo() = default; -SearchEngineInfo::SearchEngineInfo(const std::string& name, - const std::string& hostname, - const std::string& query, - bool is_always_classed_as_a_search) - : name(name), - hostname(hostname), - query(query), - is_always_classed_as_a_search(is_always_classed_as_a_search) {} 
+SearchEngineInfo::SearchEngineInfo(const std::string& url_pattern, + const std::string& result_page_url_pattern, + const std::string& search_term_query_key) + : url_pattern(url_pattern), + result_page_url_pattern(result_page_url_pattern), + search_term_query_key(search_term_query_key) {} SearchEngineInfo::SearchEngineInfo(const SearchEngineInfo& info) = default; diff --git a/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_info.h b/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_info.h index 62e2721e0da6..4f19abb1955c 100644 --- a/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_info.h +++ b/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_info.h @@ -13,17 +13,15 @@ namespace ads { struct SearchEngineInfo final { public: SearchEngineInfo(); - SearchEngineInfo(const std::string& name, - const std::string& hostname, - const std::string& query, - bool is_always_classed_as_a_search); + SearchEngineInfo(const std::string& url_pattern, + const std::string& result_page_url_pattern, + const std::string& search_term_query_key); SearchEngineInfo(const SearchEngineInfo& info); ~SearchEngineInfo(); - std::string name; - std::string hostname; - std::string query; - bool is_always_classed_as_a_search = false; + std::string url_pattern; + std::string result_page_url_pattern; + std::string search_term_query_key; }; } // namespace ads diff --git a/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_results_page_url_pattern_util.cc b/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_results_page_url_pattern_util.cc new file mode 100644 index 000000000000..22435ac465ef --- /dev/null +++ b/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_results_page_url_pattern_util.cc @@ -0,0 +1,44 @@ +/* Copyright (c) 2022 The Brave Authors. All rights reserved. + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. 
If a copy of the MPL was not distributed with this file,
+ * You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "bat/ads/internal/base/search_engine_results_page_url_pattern_util.h"
+
+#include "base/no_destructor.h"
+#include "base/strings/strcat.h"
+#include "bat/ads/internal/base/search_engine_url_pattern_util.h"
+
+namespace ads {
+
+const std::string& GetAmazonResultsPageUrlPattern() {
+  static base::NoDestructor<std::string> url_pattern(
+      base::StrCat({GetAmazonUrlPattern(), "s"}));
+  return *url_pattern;
+}
+
+const std::string& GetGoogleResultsPageUrlPattern() {
+  static base::NoDestructor<std::string> url_pattern(
+      base::StrCat({GetGoogleUrlPattern(), "search"}));
+  return *url_pattern;
+}
+
+const std::string& GetMojeekResultsPageUrlPattern() {
+  static base::NoDestructor<std::string> url_pattern(
+      base::StrCat({GetMojeekUrlPattern(), "search"}));
+  return *url_pattern;
+}
+
+const std::string& GetWikipediaResultsPageUrlPattern() {
+  static base::NoDestructor<std::string> url_pattern(
+      base::StrCat({GetWikipediaUrlPattern(), "wiki/(.*)"}));
+  return *url_pattern;
+}
+
+const std::string& GetYahooResultsPageUrlPattern() {
+  static base::NoDestructor<std::string> url_pattern(
+      base::StrCat({GetYahooUrlPattern(), "search(.*)"}));
+  return *url_pattern;
+}
+
+}  // namespace ads
diff --git a/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_results_page_url_pattern_util.h b/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_results_page_url_pattern_util.h
new file mode 100644
index 000000000000..b5ddf8fe44d2
--- /dev/null
+++ b/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_results_page_url_pattern_util.h
@@ -0,0 +1,23 @@
+/* Copyright (c) 2022 The Brave Authors. All rights reserved.
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this file,
+ * You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef BRAVE_VENDOR_BAT_NATIVE_ADS_SRC_BAT_ADS_INTERNAL_BASE_SEARCH_ENGINE_RESULTS_PAGE_URL_PATTERN_UTIL_H_
+#define BRAVE_VENDOR_BAT_NATIVE_ADS_SRC_BAT_ADS_INTERNAL_BASE_SEARCH_ENGINE_RESULTS_PAGE_URL_PATTERN_UTIL_H_
+
+#include <string>
+
+namespace ads {
+
+// Each function returns the search engine's URL pattern with the path of its
+// results page appended. The pattern is built on first use and then reused.
+const std::string& GetAmazonResultsPageUrlPattern();
+const std::string& GetGoogleResultsPageUrlPattern();
+const std::string& GetMojeekResultsPageUrlPattern();
+const std::string& GetWikipediaResultsPageUrlPattern();
+const std::string& GetYahooResultsPageUrlPattern();
+
+}  // namespace ads
+
+#endif  // BRAVE_VENDOR_BAT_NATIVE_ADS_SRC_BAT_ADS_INTERNAL_BASE_SEARCH_ENGINE_RESULTS_PAGE_URL_PATTERN_UTIL_H_
diff --git a/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_results_page_url_pattern_util_unittest.cc b/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_results_page_url_pattern_util_unittest.cc
new file mode 100644
index 000000000000..0c9465560996
--- /dev/null
+++ b/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_results_page_url_pattern_util_unittest.cc
@@ -0,0 +1,99 @@
+/* Copyright (c) 2022 The Brave Authors. All rights reserved.
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this file,
+ * You can obtain one at http://mozilla.org/MPL/2.0/.
*/ + +#include "bat/ads/internal/base/search_engine_results_page_url_pattern_util.h" + +#include "testing/gtest/include/gtest/gtest.h" + +// npm run test -- brave_unit_tests --filter=BatAds* + +namespace ads { + +TEST(BatAdsSearchEngineResultsPageUrlPatternUtilTest, + GetAmazonResultsPageUrlPattern) { + // Arrange + + // Act + const std::string& url_pattern = GetAmazonResultsPageUrlPattern(); + + // Assert + EXPECT_EQ( + "https://" + "www.amazon.(ae|ca|cn|co.jp|co.uk|com|com.au|com.br|com.mx|de|eg|es|fr|" + "in|it|nl|pl|sa|se|sp|tr)/s", + url_pattern); +} + +TEST(BatAdsSearchEngineResultsPageUrlPatternUtilTest, + GetGoogleResultsPageUrlPattern) { + // Arrange + + // Act + const std::string& url_pattern = GetGoogleResultsPageUrlPattern(); + + // Assert + EXPECT_EQ( + "https://" + "www.google.(ac|ad|ae|al|am|as|at|az|ba|be|bf|bg|bi|bj|bs|bt|ca|cat|cd|" + "cf|cg|ch|ci|ci|cl|cm|cn|co.bw|co.ck|co.cr|co.id|co.il|co.im|co.in|co.je|" + "co.jp|co.ke|co.kr|co.ls|co.ma|co.mz|co.nz|co.th|co.tz|co.ug|co.uk|co.uz|" + "co.ve|co.vi|co.za|co.zm|co.zw|com.af|com.ag|com.ai|com.ar|com.au|com.bd|" + "com.bh|com.bn|com.bo|com.br|com.by|com.bz|com.co|com.cu|com.cy|com.do|" + "com.ec|com.eg|com.et|com.fj|com.gh|com.gi|com.gt|com.hk|com.jm|com.kg|" + "com.kh|com.kw|com.lb|com.ly|com.mt|com.mx|com.my|com.na|com.nf|com.ng|" + "com.ni|com.np|com.om|com.pa|com.pe|com.pg|com.ph|com.pk|com.pr|com.py|" + "com.qa|com.sa|com.sb|com.sg|com.sl|com.sv|com.tj|com.tr|com.tw|com.ua|" + "com.uy|com.vc|com.vn|com|cv|cz|de|dj|dk|dm|dz|ee|es|fi|fm|fr|ga|ge|gg|" + "gl|gm|gp|gr|gy|hn|hr|ht|hu|ie|iq|is|it.ao|it|jo|ki|kz|la|li|lk|lt|lu|lv|" + "md|me|mg|mk|ml|mn|ms|mu|mv|mw|ne|nl|no|nr|nu|pl|pn|ps|pt|ro|rs|ru|rw|sc|" + "se|sh|si|sk|sm|sn|so|sr|st|td|tg|tk|tl|tm|tn|to|tt|vg|vu|ws)/search", + url_pattern); +} + +TEST(BatAdsSearchEngineResultsPageUrlPatternUtilTest, + GetMojeekResultsPageUrlPattern) { + // Arrange + + // Act + const std::string& url_pattern = GetMojeekResultsPageUrlPattern(); + + // Assert + 
EXPECT_EQ("https://www.mojeek.(co.uk|com)/search", url_pattern); +} + +TEST(BatAdsSearchEngineResultsPageUrlPatternUtilTest, + GetWikipediaResultsPageUrlPattern) { + // Arrange + + // Act + const std::string& url_pattern = GetWikipediaResultsPageUrlPattern(); + + // Assert + EXPECT_EQ( + "https://" + "(af|ar|arz|ast|az|azb|be|bg|bn|ca|ce|ceb|cs|cy|da|de|el|en|eo|es|et|eu|" + "fa|fi|fr|gl|he|hi|hr|hu|hy|id|it|ja|ka|kk|ko|la|lt|lv|min|mk|ms|my|nan|" + "nl|nn|no|pl|pt|ro|ru|sh|simple|sk|sl|sr|sv|ta|tg|th|tr|tt|uk|ur|uz|vi|" + "vo|war|zh|zh-yue).wikipedia.org/wiki/(.*)", + url_pattern); +} + +TEST(BatAdsSearchEngineResultsPageUrlPatternUtilTest, + GetYahooResultsPageUrlPattern) { + // Arrange + + // Act + const std::string& url_pattern = GetYahooResultsPageUrlPattern(); + + // Assert + EXPECT_EQ( + "https://" + "((au|be|br|ca|de|en-maktoob|es|espanol|fr|fr-be|gr|hk|id|ie|in|it|" + "malaysia|nz|ph|qc|ro|se|sg|tw|uk|vn|www|za).search.yahoo.com/" + "|search.yahoo.com/)search(.*)", + url_pattern); +} + +} // namespace ads diff --git a/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_results_page_util.cc b/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_results_page_util.cc new file mode 100644 index 000000000000..167ffb2119b7 --- /dev/null +++ b/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_results_page_util.cc @@ -0,0 +1,80 @@ +/* Copyright (c) 2022 The Brave Authors. All rights reserved. + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this file, + * You can obtain one at http://mozilla.org/MPL/2.0/. 
*/
+
+#include "bat/ads/internal/base/search_engine_results_page_util.h"
+
+#include <vector>
+
+#include "bat/ads/internal/base/search_engine_info.h"
+#include "bat/ads/internal/base/search_engines_util.h"
+#include "bat/ads/internal/base/url_util.h"
+#include "net/base/url_util.h"
+#include "third_party/re2/src/re2/re2.h"
+#include "url/gurl.h"
+
+namespace ads {
+
+namespace {
+
+// Returns the first search engine whose results page pattern matches |url|.
+absl::optional<SearchEngineInfo> FindSearchEngineResultsPage(const GURL& url) {
+  if (!url.is_valid()) {
+    return absl::nullopt;
+  }
+
+  const GURL url_with_empty_query = GetUrlWithEmptyQuery(url);
+  const std::vector<SearchEngineInfo>& search_engines = GetSearchEngines();
+  for (const auto& search_engine : search_engines) {
+    if (RE2::FullMatch(url_with_empty_query.spec(),
+                       search_engine.result_page_url_pattern)) {
+      return search_engine;
+    }
+  }
+
+  return absl::nullopt;
+}
+
+}  // namespace
+
+bool IsSearchEngineResultsPage(const GURL& url) {
+  const absl::optional<SearchEngineInfo> search_engine_optional =
+      FindSearchEngineResultsPage(url);
+  if (!search_engine_optional) {
+    return false;
+  }
+  const SearchEngineInfo& search_engine = search_engine_optional.value();
+
+  if (search_engine.search_term_query_key.empty()) {
+    // We should only match |result_page_url_pattern| if the search engine does
+    // not have a search term query key
+    return true;
+  }
+
+  std::string search_term_query_value;
+  return net::GetValueForKeyInQuery(url, search_engine.search_term_query_key,
+                                    &search_term_query_value);
+}
+
+absl::optional<std::string> ExtractSearchTermQueryValue(const GURL& url) {
+  const absl::optional<SearchEngineInfo> search_engine_optional =
+      FindSearchEngineResultsPage(url);
+  if (!search_engine_optional) {
+    return absl::nullopt;
+  }
+  const SearchEngineInfo& search_engine = search_engine_optional.value();
+  if (search_engine.search_term_query_key.empty()) {
+    // There is nothing to extract without a search term query key
+    return absl::nullopt;
+  }
+
+  std::string search_term_query_value;
+  if (!net::GetValueForKeyInQuery(url, search_engine.search_term_query_key,
+                                  &search_term_query_value)) {
+    return absl::nullopt;
+  }
+  return search_term_query_value;
+}
+
+}  // namespace
ads
diff --git a/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_results_page_util.h b/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_results_page_util.h
new file mode 100644
index 000000000000..1584e3ced210
--- /dev/null
+++ b/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_results_page_util.h
@@ -0,0 +1,29 @@
+/* Copyright (c) 2022 The Brave Authors. All rights reserved.
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this file,
+ * You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef BRAVE_VENDOR_BAT_NATIVE_ADS_SRC_BAT_ADS_INTERNAL_BASE_SEARCH_ENGINE_RESULTS_PAGE_UTIL_H_
+#define BRAVE_VENDOR_BAT_NATIVE_ADS_SRC_BAT_ADS_INTERNAL_BASE_SEARCH_ENGINE_RESULTS_PAGE_UTIL_H_
+
+#include <string>
+
+#include "third_party/abseil-cpp/absl/types/optional.h"
+
+class GURL;
+
+namespace ads {
+
+// Returns true if |url| matches the results page URL pattern of a search
+// engine and, if that search engine has a search term query key, the query of
+// |url| contains that key. Returns false if |url| is invalid.
+bool IsSearchEngineResultsPage(const GURL& url);
+
+// Returns the value of the search engine's search term query key from |url|,
+// or absl::nullopt if |url| is not a search engine results page or does not
+// contain that key.
+absl::optional<std::string> ExtractSearchTermQueryValue(const GURL& url);
+
+}  // namespace ads
+
+#endif  // BRAVE_VENDOR_BAT_NATIVE_ADS_SRC_BAT_ADS_INTERNAL_BASE_SEARCH_ENGINE_RESULTS_PAGE_UTIL_H_
diff --git a/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_results_page_util_unittest.cc b/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_results_page_util_unittest.cc
new file mode 100644
index 000000000000..e0f0478dff66
--- /dev/null
+++ b/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_results_page_util_unittest.cc
@@ -0,0 +1,160 @@
+/* Copyright (c) 2022 The Brave Authors. All rights reserved.
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this file,
+ * You can obtain one at http://mozilla.org/MPL/2.0/.
*/ + +#include "bat/ads/internal/base/search_engine_results_page_util.h" + +#include + +#include "base/no_destructor.h" +#include "testing/gtest/include/gtest/gtest.h" +#include "url/gurl.h" + +// npm run test -- brave_unit_tests --filter=BatAds* + +namespace ads { + +namespace { + +const std::vector& GetSearchEngineResultsPageUrls() { + // When adding new search engines you should perform a search for |foobar| and + // copy the complete URL from the address bar. + static base:: + NoDestructor> + urls( + {GURL(R"(https://developer.mozilla.org/en-US/search?q=foobar)"), + GURL(R"(https://duckduckgo.com/?q=foobar&t=h_&ia=web)"), + GURL(R"(https://en.wikipedia.org/wiki/Foobar)"), + GURL(R"(https://fireball.de/search?q=foobar)"), + GURL(R"(https://github.com/search?q=foobar)"), + GURL(R"(https://infogalactic.com/info/Foobar)"), + GURL(R"(https://ja.wikipedia.org/wiki/Foobar)"), + GURL(R"(https://results.excite.com/serp?q=foobar)"), + GURL(R"(https://search.brave.com/search?q=foobar&source=web)"), + GURL( + R"(https://search.lycos.com/web/?q=foobar&keyvol=00eba27cf23332982690&_gl=1%2Aaqbo9y%2A_ga%2AMzk3NjM2MDcxLjE2NTM5MjY5NDQ.%2A_ga_76FJGHQNN6%2AMTY1MzkyNjk0NC4xLjEuMTY1MzkyNjk4MC4w)"), + GURL( + R"(https://search.yahoo.com/search;_ylt=AwrE19xR4pRi4HkAbx9DDWVH;_ylc=X1MDMTE5NzgwNDg2NwRfcgMyBGZyAwRmcjIDcDpzLHY6c2ZwLG06c2ItdG9wBGdwcmlkA245V3NRQnh5U1lHeW5haWhGdGp6X0EEbl9yc2x0AzAEbl9zdWdnAzEwBG9yaWdpbgNzZWFyY2gueWFob28uY29tBHBvcwMwBHBxc3RyAwRwcXN0cmwDMARxc3RybAM2BHF1ZXJ5A2Zvb2JhcgR0X3N0bXADMTY1MzkyNDQ0Ng--?p=foobar&fr=sfp&fr2=p%3As%2Cv%3Asfp%2Cm%3Asb-top&iscqry=)"), + GURL( + R"(https://stackoverflow.com/search?q=foobar&s=2cacbef4-4b9e-4b96-a9ed-cdaf97f26dac)"), + GURL(R"(https://swisscows.com/web?query=foobar)"), + GURL(R"(https://twitter.com/search?q=foobar&src=typed_query)"), + GURL( + R"(https://uk.search.yahoo.com/search?p=foobar&fr=yfp-t&fr2=p%3Afp%2Cm%3Asb&ei=UTF-8&fp=1)"), + GURL(R"(https://www.amazon.co.uk/s?k=foobar&ref=nb_sb_noss)"), + GURL( + 
R"(https://www.amazon.com/s?k=foobar&crid=2RGPVS512O6MC&sprefix=fo%2Caps%2C303&ref=nb_sb_noss_2)"), + GURL( + R"(https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd=foobar&fenlei=256&rsv_pq=a98c990d00067b3a&rsv_t=e924%2F92qfHeGEe9hHP3joPcNEeV7qqhMrWfS8KWl7qxdom3iP3CPaNk5ozg1&rqlang=en&rsv_enter=1&rsv_dl=tb&rsv_sug3=3&rsv_sug1=1&rsv_sug7=100&rsv_sug2=0&rsv_btype=i&prefixsug=foobar&rsp=7&inputT=2637&rsv_sug4=2638&rsv_sug=1)"), + GURL( + R"(https://www.bing.com/search?q=foobar&form=QBLH&sp=-1&pq=&sc=8-0&qs=n&sk=&cvid=0025E271E6E849BAA97EB176045A1ACB)"), + GURL( + R"(https://www.dogpile.com/serp?q=foobar&sc=7jgbppidxNxC20)"), + GURL(R"(https://www.ecosia.org/search?method=index&q=foobar)"), + GURL(R"(https://www.findx.com/search?q=foobar)"), + GURL( + R"(https://www.gigablast.com/search?c=main&qlangcountry=en-us&q=foobar)"), + GURL( + R"(https://www.google.co.uk/search?q=foobar&source=hp&ei=tPSUYoO2Dc7GgQaXj4rAAg&iflsig=AJiK0e8AAAAAYpUCxCNJHkuwNPLoN3BeGzkEJeb4zUxX&ved=0ahUKEwiD4IuX1of4AhVOY8AKHZeHAigQ4dUDCAo&uact=5&oq=foobar&gs_lcp=Cgdnd3Mtd2l6EAMyBQgAEIAEMgUIABCABDIKCAAQsQMQgwEQCjIKCAAQsQMQgwEQCjIHCAAQsQMQCjIHCAAQsQMQCjIKCAAQsQMQgwEQCjIFCAAQgAQyCggAELEDEIMBEAoyBQgAEIAEOg4IABDqAhC0AhDZAhDlAjoRCC4QgAQQsQMQgwEQxwEQ0QM6CwguEIAEELEDEIMBOgsIABCABBCxAxCDAToICC4QsQMQgwE6CAgAELEDEIMBOg4ILhCABBCxAxCDARDUAjoICAAQgAQQsQM6EQguEIAEELEDEIMBEMcBEKMCOggILhCABBCxAzoICAAQgAQQyQM6BQgAEJIDOgUILhCABFD5A1jXGGClGmgEcAB4AYAB-gSIAeYOkgELMi4xLjEuMS4xLjGYAQCgAQGwAQg&sclient=gws-wiz)"), + GURL( + 
R"(https://www.google.com/search?q=foobar&source=hp&ei=a_SUYqPWM8zVgQat0paQAw&iflsig=AJiK0e8AAAAAYpUCewOIHMGsHZw9I0JyvAp36Vr8ebqy&ved=0ahUKEwjjt8r01Yf4AhXMasAKHS2pBTIQ4dUDCAo&uact=5&oq=foobar&gs_lcp=Cgdnd3Mtd2l6EAMyCAgAEIAEELEDMggIABCABBCxAzIFCAAQgAQyBQgAEIAEMgUIABCABDIFCAAQgAQyBQgAEIAEMgUIABCABDIFCAAQgAQyBQgAEIAEOg4IABCPARDqAhCMAxDlAjoRCC4QgAQQsQMQgwEQxwEQ0QM6CwguEIAEELEDEIMBOgsIABCABBCxAxCDAToICC4QsQMQgwE6CAgAELEDEIMBOg4ILhCABBCxAxCDARDUAjoRCC4QgAQQsQMQgwEQxwEQowI6CAguEIAEELEDOgsILhCABBDHARCvAToICAAQgAQQyQM6BQgAEJIDOgoIABCxAxCDARAKOgcIABCxAxAKOgUILhCABFBdWOYLYOoNaAJwAHgAgAHzAYgB5AaSAQU0LjIuMZgBAKABAbABCg&sclient=gws-wiz)"), + GURL( + R"(https://www.metacrawler.com/serp?q=foobar&sc=1txEVm9N438G20)"), + GURL(R"(https://www.mojeek.co.uk/search?q=foobar)"), + GURL(R"(https://www.mojeek.com/search?q=foobar)"), + GURL( + R"(https://www.petalsearch.com/search?query=foobar&channel=all&from=PCweb&ps=10&pn=1&sid=s26awog2d63p2eryj0j1t4jtde5bxl0l&qs=1&page_start=0)"), + GURL(R"(https://www.qwant.com/?q=foobar&t=web)"), + GURL( + R"(https://www.semanticscholar.org/search?q=foobar&sort=relevance)"), + GURL( + R"(https://www.sogou.com/web?query=foobar&_asf=www.sogou.com&_ast=&w=01019900&p=40040100&ie=utf8&from=index-nologin&s_from=index&sut=866&sst0=1653927358545&lkt=6%2C1653927357679%2C1653927358446&sugsuv=1653927332873722&sugtime=1653927358545)"), + GURL(R"(https://www.startpage.com/sp/search)"), + GURL( + 
R"(https://www.webcrawler.com/serp?q=foobar&sc=MQ_doqXGq3EycDL_UUfFwdc46L4OQAEeXLiJA7JSoqit01lROq--mxI5cErzOAUign5nlzOVX8I3S-g7vV1NVTlsVyPA0jnDnTu6cDXzmNBDI6QfiwhNOVVTbHw1RpzacIMvLERSIpdyBa6G-ES_1ZkCT68FZXhd-sz_3nyPwdkjEjZzPS6SxWzXrzua0JtPxbhJYLEHouYbxUBEFEfpeaLpuIWgDd-47PqBnopwGjBmnkaFZ0sIG8HkZHimpKRprWJyZ2jUIKSw-yA6Os-MQo7T1hbHQBmlt1ZwZA5vadVGaNAaQ2bdKgGzSPHHIvq8czhoFgRcU0zUVqbPaI4Ak-3mo9J_K8aYAtHorJCeYw_c23BSudWciKLNTEXnm2HdiXwxrhmaSvYgq2dhndpJ6airnmBaqrW6kyri_RBHsda6GhEWKwdL6Z99Q0mPIgEKVxbEa4vIQyHgWnLSuqBsPyLrymO4pzM0YLI8A54orUHQBY-JTxtf6NC-2Sy3GfpXX0BFeqj793Cgy7SZBLBu6xfxnlB2Expj0tWoXJN_RQtQAz3MN6HlldHkx-vgOnn_92hQjPjRum8ICykuvlKpcclRpu-c1rjpS2PI4cUU-aLVIV4X0HyarYK3bcj1ntidbV12AymDCQE2wiTIw3jQHHr3mNDVimR5jWc-S4RxjoJIOlE0kg2ZNiAcsIG3Iycz7627WZdgBFtT6dVIrrpVnziIspj0Ykt_K4RZmKBOhGbv4f9ko0RJIy4ZSq1q4m9NXtWYbWFEav00TPlHhglmiX4DRGvCca8pvvdFo12XTdeKJdAATQHM2efrbKf8EsDgIiR3ithjV1len4JYAbzoK7OdyfRw81ACjHJXhIprnQRRZDrJHRGuVceUDb4FK3m2Y7HNNffVBoGn40greWpbA9FZLZImHoUxTeXOVzAp47qOO6-ExMlehN5iNV2d6aWqQx6SubKPC6t80SKhzB54qFbzbTtSjFs30ZztUVGFshJRBuG6he_JAZDNFdhEZRaLAkOTSp51HAXzCkovTMwiXoF_pjzCjsqzhRd0K05Wfh6eZap2ijQ8_V9rP0HwXJCDnF26daFm-Td4XK906t5WYIbum-J2eJGZ0aZ3lL71_mJzYTJW-QaToDlmikWoBTfb5Pfu0fAjeL2n11AJmAHuMFiTwisQrwRWIT8B2dx-ObXnkiVQa-jP2FCMJE7F4XvveyDeBSVLiPgIZYhQOry73fNzenDtdySAfge9kC4ypr9hxEvjfQzGORKUlDJABC0yyBokSFA8fng_5401iG66BkqIDe6idz-7C6MvsEbm3_l0vETzOLoQOLCnCxCICUV74BYZGbk_U4gQGouUp_aeUfRr8iAeB94CUhuLYVNzXxjIubOpWeLZBLcEo67eHPlMT6BvIbqpYudl1KeKRUP_FtYZx6sIvhL7pKRWDyxm3jewsAoLTO_cUv3thVBPfZEQSrkOYtVzrDr4ZGT3jmLS8W0iWOZM-h5vWvoQ0Wmb0fq15_twlzrVRjuOAGJ_2qMZhonmx0mfdXcQxzz9j39zJenVrLDDsEgLnEcxn2ObCYrSrNnCOu3EuwqaPRGmkf38pANr3ZUnFQAwSl8GF84pqjADQ8lDfsc-xu8RjbmnE0GZwaHHVh98lkmN_cFXS1cpGDByli6NVjE26H2VqTsFOn-3Lenc_nnWyE9srd1jmncLSFu70WxZ0pTCIsAlU_44PkmAxqOWABL61n73i-dc7OZWARzYH-YZ_57fR0IQT3uvxv8J_ZPUSsHCBtmAmIAwRhDbhzNFWh7K4C1SOIV9Mkvny0hjcowxy39zju20R1RzKDiAnQMF0vMYSmjkqynQfd4TEP9LKPogM3gJIafVBLtYisp-E23t0WZpx9WXWDLl)"), + 
GURL(R"(https://www.wolframalpha.com/input?i=foobar)"), + GURL(R"(https://www.youtube.com/results?search_query=foobar)"), + GURL(R"(https://yandex.com/search/?text=foobar&lr=104993)")}); + return *urls; +} + +} // namespace + +TEST(BatAdsSearchEngineUtilTest, IsSearchEngineResultsPage) { + // Arrange + const std::vector& urls = GetSearchEngineResultsPageUrls(); + + // Act + for (const auto& url : urls) { + EXPECT_TRUE(IsSearchEngineResultsPage(url)); + } + + // Assert +} + +TEST(BatAdsSearchEngineUtilTest, IsNotSearchEngineResultsPage) { + // Arrange + const GURL url = GURL("https://brave.com/"); + + // Act + const bool is_search_engine_result_page = IsSearchEngineResultsPage(url); + + // Assert + EXPECT_FALSE(is_search_engine_result_page); +} + +TEST(BatAdsSearchEngineUtilTest, IsNotSearchEngineResultsPageWithInvalidUrl) { + // Arrange + const GURL url = GURL("INVALID_URL"); + + // Act + const bool is_search_engine_result_page = IsSearchEngineResultsPage(url); + + // Assert + EXPECT_FALSE(is_search_engine_result_page); +} + +TEST(BatAdsSearchEngineUtilTest, ExtractSearchTermQueryValue) { + // Arrange + const std::vector& urls = GetSearchEngineResultsPageUrls(); + + // Act + for (const auto& url : urls) { + const absl::optional search_term_query_value_optional = + ExtractSearchTermQueryValue(url); + if (search_term_query_value_optional) { + EXPECT_EQ("foobar", search_term_query_value_optional.value()); + } + } + + // Assert +} + +TEST(BatAdsSearchEngineUtilTest, + FailToExtractSearchTermQueryValueFromUrlWithMissingQuery) { + // Arrange + const GURL url = GURL("https://google.com/"); + + // Act + const absl::optional search_term_query_value_optional = + ExtractSearchTermQueryValue(url); + + // Assert + EXPECT_FALSE(search_term_query_value_optional); +} + +TEST(BatAdsSearchEngineUtilTest, + FailToExtractSearchTermQueryValueFromInvalidUrl) { + // Arrange + const GURL url = GURL("INVALID_URL"); + + // Act + const absl::optional search_term_query_value_optional = + 
ExtractSearchTermQueryValue(url);
+
+  // Assert
+  EXPECT_FALSE(search_term_query_value_optional);
+}
+
+}  // namespace ads
diff --git a/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_subdomains_util.cc b/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_subdomains_util.cc
new file mode 100644
index 000000000000..96e6051bb717
--- /dev/null
+++ b/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_subdomains_util.cc
@@ -0,0 +1,36 @@
+/* Copyright (c) 2022 The Brave Authors. All rights reserved.
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this file,
+ * You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "bat/ads/internal/base/search_engine_subdomains_util.h"
+
+#include "base/no_destructor.h"
+
+namespace ads {
+
+const std::vector<std::string>& GetWikipediaSearchEngineSubdomains() {
+  // See https://www.wikipedia.org/.
+  static base::NoDestructor<std::vector<std::string>> subdomains(
+      {"af", "ar", "arz", "ast", "az", "azb", "be", "bg", "bn",
+       "ca", "ce", "ceb", "cs", "cy", "da", "de", "el", "en",
+       "eo", "es", "et", "eu", "fa", "fi", "fr", "gl", "he",
+       "hi", "hr", "hu", "hy", "id", "it", "ja", "ka", "kk",
+       "ko", "la", "lt", "lv", "min", "mk", "ms", "my", "nan",
+       "nl", "nn", "no", "pl", "pt", "ro", "ru", "sh", "simple",
+       "sk", "sl", "sr", "sv", "ta", "tg", "th", "tr", "tt",
+       "uk", "ur", "uz", "vi", "vo", "war", "zh", "zh-yue"});
+  return *subdomains;
+}
+
+const std::vector<std::string>& GetYahooSearchEngineSubdomains() {
+  // See https://uk.yahoo.com/everything/world.
+  static base::NoDestructor<std::vector<std::string>> subdomains(
+      {"au", "be", "br", "ca", "de", "en-maktoob", "es",
+       "espanol", "fr", "fr-be", "gr", "hk", "id", "ie",
+       "in", "it", "malaysia", "nz", "ph", "qc", "ro",
+       "se", "sg", "tw", "uk", "vn", "www", "za"});
+  return *subdomains;
+}
+
+}  // namespace ads
diff --git a/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_subdomains_util.h b/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_subdomains_util.h
new file mode 100644
index 000000000000..2cd748a4aa81
--- /dev/null
+++ b/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_subdomains_util.h
@@ -0,0 +1,22 @@
+/* Copyright (c) 2022 The Brave Authors. All rights reserved.
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this file,
+ * You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef BRAVE_VENDOR_BAT_NATIVE_ADS_SRC_BAT_ADS_INTERNAL_BASE_SEARCH_ENGINE_SUBDOMAINS_UTIL_H_
+#define BRAVE_VENDOR_BAT_NATIVE_ADS_SRC_BAT_ADS_INTERNAL_BASE_SEARCH_ENGINE_SUBDOMAINS_UTIL_H_
+
+#include <string>
+#include <vector>
+
+namespace ads {
+
+// Return the subdomains under which the respective search engine is served,
+// e.g. "en" in https://en.wikipedia.org/ and "uk" in
+// https://uk.search.yahoo.com/.
+const std::vector<std::string>& GetWikipediaSearchEngineSubdomains();
+const std::vector<std::string>& GetYahooSearchEngineSubdomains();
+
+}  // namespace ads
+
+#endif  // BRAVE_VENDOR_BAT_NATIVE_ADS_SRC_BAT_ADS_INTERNAL_BASE_SEARCH_ENGINE_SUBDOMAINS_UTIL_H_
diff --git a/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_url_pattern_util.cc b/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_url_pattern_util.cc
new file mode 100644
index 000000000000..480306e6d44f
--- /dev/null
+++ b/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_url_pattern_util.cc
@@ -0,0 +1,64 @@
+/* Copyright (c) 2022 The Brave Authors. All rights reserved.
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this file,
+ * You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "bat/ads/internal/base/search_engine_url_pattern_util.h"
+
+#include "base/no_destructor.h"
+#include "base/strings/strcat.h"
+#include "base/strings/string_util.h"
+#include "bat/ads/internal/base/search_engine_domain_extensions_util.h"
+#include "bat/ads/internal/base/search_engine_subdomains_util.h"
+
+namespace ads {
+
+// Each function below builds its pattern once, on first call, by joining the
+// supported domain extensions or subdomains with "|" and returns a reference
+// to the cached string.
+// NOTE(review): the "." characters are not escaped, so if these patterns are
+// evaluated as regular expressions each "." matches any character - confirm
+// whether look-alike hosts should be rejected.
+
+// Matches "https://www.amazon.<domain extension>/".
+const std::string& GetAmazonUrlPattern() {
+  static base::NoDestructor<std::string> url_pattern(base::StrCat(
+      {"https://www.amazon.(",
+       base::JoinString(GetAmazonSearchEngineDomainExtensions(), "|"), ")/"}));
+  return *url_pattern;
+}
+
+// Matches "https://www.google.<domain extension>/".
+const std::string& GetGoogleUrlPattern() {
+  static base::NoDestructor<std::string> url_pattern(base::StrCat(
+      {"https://www.google.(",
+       base::JoinString(GetGoogleSearchEngineDomainExtensions(), "|"), ")/"}));
+  return *url_pattern;
+}
+
+// Matches "https://www.mojeek.<domain extension>/".
+const std::string& GetMojeekUrlPattern() {
+  static base::NoDestructor<std::string> url_pattern(base::StrCat(
+      {"https://www.mojeek.(",
+       base::JoinString(GetMojeekSearchEngineDomainExtensions(), "|"), ")/"}));
+  return *url_pattern;
+}
+
+// Matches "https://<subdomain>.wikipedia.org/".
+const std::string& GetWikipediaUrlPattern() {
+  static base::NoDestructor<std::string> url_pattern(base::StrCat(
+      {"https://(", base::JoinString(GetWikipediaSearchEngineSubdomains(), "|"),
+       ").wikipedia.org/"}));
+  return *url_pattern;
+}
+
+// Matches "https://<subdomain>.search.yahoo.com/" as well as the bare
+// "https://search.yahoo.com/".
+const std::string& GetYahooUrlPattern() {
+  static base::NoDestructor<std::string> url_pattern(base::StrCat(
+      {"https://((", base::JoinString(GetYahooSearchEngineSubdomains(), "|"),
+       ").search.yahoo.com/|search.yahoo.com/)"}));
+  return *url_pattern;
+}
+
+}  // namespace ads
diff --git a/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_url_pattern_util.h b/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_url_pattern_util.h
new file mode 100644
index 000000000000..2c29d839bd5b
--- /dev/null
+++ b/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_url_pattern_util.h
@@ -0,0 +1,23 @@
+/* Copyright (c) 2022 The Brave Authors. All rights reserved.
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this file,
+ * You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef BRAVE_VENDOR_BAT_NATIVE_ADS_SRC_BAT_ADS_INTERNAL_BASE_SEARCH_ENGINE_URL_PATTERN_UTIL_H_
+#define BRAVE_VENDOR_BAT_NATIVE_ADS_SRC_BAT_ADS_INTERNAL_BASE_SEARCH_ENGINE_URL_PATTERN_UTIL_H_
+
+#include <string>
+
+namespace ads {
+
+// Return the pattern matching the home page URL of the respective search
+// engine across all of its supported domain extensions or subdomains.
+const std::string& GetAmazonUrlPattern();
+const std::string& GetGoogleUrlPattern();
+const std::string& GetMojeekUrlPattern();
+const std::string& GetWikipediaUrlPattern();
+const std::string& GetYahooUrlPattern();
+
+}  // namespace ads
+
+#endif  // BRAVE_VENDOR_BAT_NATIVE_ADS_SRC_BAT_ADS_INTERNAL_BASE_SEARCH_ENGINE_URL_PATTERN_UTIL_H_
diff --git a/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_url_pattern_util_unittest.cc b/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_url_pattern_util_unittest.cc
new file mode 100644
index 000000000000..ba7544211935
--- /dev/null
+++ b/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_url_pattern_util_unittest.cc
@@ -0,0 +1,94 @@
+/* Copyright (c) 2022 The Brave Authors. All rights reserved.
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this file,
+ * You can obtain one at http://mozilla.org/MPL/2.0/.
*/ + +#include "bat/ads/internal/base/search_engine_url_pattern_util.h" + +#include "testing/gtest/include/gtest/gtest.h" + +// npm run test -- brave_unit_tests --filter=BatAds* + +namespace ads { + +TEST(BatAdsSearchEngineUrlPatternUtilTest, GetAmazonUrlPattern) { + // Arrange + + // Act + const std::string& url_pattern = GetAmazonUrlPattern(); + + // Assert + EXPECT_EQ( + "https://" + "www.amazon.(ae|ca|cn|co.jp|co.uk|com|com.au|com.br|com.mx|de|eg|es|fr|" + "in|it|nl|pl|sa|se|sp|tr)/", + url_pattern); +} + +TEST(BatAdsSearchEngineUrlPatternUtilTest, GetGoogleUrlPattern) { + // Arrange + + // Act + const std::string& url_pattern = GetGoogleUrlPattern(); + + // Assert + EXPECT_EQ( + "https://" + "www.google.(ac|ad|ae|al|am|as|at|az|ba|be|bf|bg|bi|bj|bs|bt|ca|cat|cd|" + "cf|cg|ch|ci|ci|cl|cm|cn|co.bw|co.ck|co.cr|co.id|co.il|co.im|co.in|co.je|" + "co.jp|co.ke|co.kr|co.ls|co.ma|co.mz|co.nz|co.th|co.tz|co.ug|co.uk|co.uz|" + "co.ve|co.vi|co.za|co.zm|co.zw|com.af|com.ag|com.ai|com.ar|com.au|com.bd|" + "com.bh|com.bn|com.bo|com.br|com.by|com.bz|com.co|com.cu|com.cy|com.do|" + "com.ec|com.eg|com.et|com.fj|com.gh|com.gi|com.gt|com.hk|com.jm|com.kg|" + "com.kh|com.kw|com.lb|com.ly|com.mt|com.mx|com.my|com.na|com.nf|com.ng|" + "com.ni|com.np|com.om|com.pa|com.pe|com.pg|com.ph|com.pk|com.pr|com.py|" + "com.qa|com.sa|com.sb|com.sg|com.sl|com.sv|com.tj|com.tr|com.tw|com.ua|" + "com.uy|com.vc|com.vn|com|cv|cz|de|dj|dk|dm|dz|ee|es|fi|fm|fr|ga|ge|gg|" + "gl|gm|gp|gr|gy|hn|hr|ht|hu|ie|iq|is|it.ao|it|jo|ki|kz|la|li|lk|lt|lu|lv|" + "md|me|mg|mk|ml|mn|ms|mu|mv|mw|ne|nl|no|nr|nu|pl|pn|ps|pt|ro|rs|ru|rw|sc|" + "se|sh|si|sk|sm|sn|so|sr|st|td|tg|tk|tl|tm|tn|to|tt|vg|vu|ws)/", + url_pattern); +} + +TEST(BatAdsSearchEngineUrlPatternUtilTest, GetMojeekUrlPattern) { + // Arrange + + // Act + const std::string& url_pattern = GetMojeekUrlPattern(); + + // Assert + EXPECT_EQ("https://www.mojeek.(co.uk|com)/", url_pattern); +} + +TEST(BatAdsSearchEngineUrlPatternUtilTest, GetWikipediaUrlPattern) 
{ + // Arrange + + // Act + const std::string& url_pattern = GetWikipediaUrlPattern(); + + // Assert + EXPECT_EQ( + "https://" + "(af|ar|arz|ast|az|azb|be|bg|bn|ca|ce|ceb|cs|cy|da|de|el|en|eo|es|et|eu|" + "fa|fi|fr|gl|he|hi|hr|hu|hy|id|it|ja|ka|kk|ko|la|lt|lv|min|mk|ms|my|nan|" + "nl|nn|no|pl|pt|ro|ru|sh|simple|sk|sl|sr|sv|ta|tg|th|tr|tt|uk|ur|uz|vi|" + "vo|war|zh|zh-yue).wikipedia.org/", + url_pattern); +} + +TEST(BatAdsSearchEngineUrlPatternUtilTest, GetYahooUrlPattern) { + // Arrange + + // Act + const std::string& url_pattern = GetYahooUrlPattern(); + + // Assert + EXPECT_EQ( + "https://" + "((au|be|br|ca|de|en-maktoob|es|espanol|fr|fr-be|gr|hk|id|ie|in|it|" + "malaysia|nz|ph|qc|ro|se|sg|tw|uk|vn|www|za).search.yahoo.com/" + "|search.yahoo.com/)", + url_pattern); +} + +} // namespace ads diff --git a/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_util.cc b/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_util.cc index 427107820352..65aa41e70831 100644 --- a/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_util.cc +++ b/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_util.cc @@ -8,7 +8,8 @@ #include #include "bat/ads/internal/base/search_engine_info.h" -#include "net/base/url_util.h" +#include "bat/ads/internal/base/search_engines_util.h" +#include "third_party/abseil-cpp/absl/types/optional.h" #include "third_party/re2/src/re2/re2.h" #include "url/gurl.h" @@ -16,195 +17,33 @@ namespace ads { namespace { -const std::vector kSearchEngines = { - SearchEngineInfo("Amazon", - "https://amazon.com", - "https://www.amazon.com/exec/obidos/external-search/" - "?field-keywords={searchTerms}&mode=blended", - false), - SearchEngineInfo("Baidu", - "https://baidu.com", - "https://www.baidu.com/s?wd={searchTerms}", - true), - SearchEngineInfo("Bing", - "https://bing.com", - "https://www.bing.com/search?q={searchTerms}", - true), - SearchEngineInfo("DuckDuckGo", - "https://duckduckgo.com", - 
"https://duckduckgo.com/?q={searchTerms}&t=brave", - true), - SearchEngineInfo("Fireball", - "https://fireball.com", - "https://fireball.com/search?q={searchTerms}", - true), - SearchEngineInfo("GitHub", - "https://github.com", - "https://github.com/search?q={searchTerms}", - false), - SearchEngineInfo( - "Google", - // TODO(https://github.com/brave/brave-browser/issues/8487): Brave Ads - // search engines definition doesn't match all patterns - "https://google.com", - "https://www.google.com/search?q={searchTerms}", - true), - SearchEngineInfo("Google Japan", - "https://google.co.jp", - "https://www.google.co.jp/search?q={searchTerms}", - true), - SearchEngineInfo("Stack Overflow", - "https://stackoverflow.com", - "https://stackoverflow.com/search?q={searchTerms}", - false), - SearchEngineInfo("MDN Web Docs", - "https://developer.mozilla.org", - "https://developer.mozilla.org/search?q={searchTerms}", - false), - SearchEngineInfo( - "Twitter", - "https://twitter.com", - "https://twitter.com/search?q={searchTerms}&source=desktop-search", - false), - SearchEngineInfo( - "Wikipedia", - "https://en.wikipedia.org", - "https://en.wikipedia.org/wiki/Special:Search?search={searchTerms}", - false), - SearchEngineInfo( - "Yahoo", - // TODO(https://github.com/brave/brave-browser/issues/8487): Brave Ads - // search engines definition doesn't match all patterns - "https://search.yahoo.com", - "https://search.yahoo.com/search?p={searchTerms}&fr=opensearch", - true), - SearchEngineInfo( - "Yahoo Japan", - "https://search.yahoo.co.jp", - "https://search.yahoo.co.jp/search?p={searchTerms}&fr=opensearch", - true), - SearchEngineInfo( - "YouTube", - "https://youtube.com", - "https://www.youtube.com/" - "results?search_type=search_videos&search_query={searchTerms}&search_" - "sort=relevance&search_category=0&page=", - false), - SearchEngineInfo( - "StartPage", - // TODO(https://github.com/brave/brave-browser/issues/8487): Brave Ads - // search engines definition doesn't match all 
patterns - "https://startpage.com", - "https://www.startpage.com/do/" - "dsearch?query={searchTerms}&cat=web&pl=opensearch", - true), - SearchEngineInfo("Infogalactic", - "https://infogalactic.com", - "https://infogalactic.com/w/" - "index.php?title=Special:Search&search={searchTerms}", - false), - SearchEngineInfo("Wolfram Alpha", - "https://wolframalpha.com", - "https://www.wolframalpha.com/input/?i={searchTerms}", - false), - SearchEngineInfo("Semantic Scholar", - "https://semanticscholar.org", - "https://www.semanticscholar.org/search?q={searchTerms}", - true), - SearchEngineInfo("Qwant", - "https://qwant.com", - "https://www.qwant.com/?q={searchTerms}&client=brave", - true), - SearchEngineInfo( - "Yandex", - "https://yandex.com", - "https://yandex.com/search/?text={searchTerms}&clid=2274777", - true), - SearchEngineInfo("Ecosia", - "https://ecosia.org", - "https://www.ecosia.org/search?q={searchTerms}", - true), - SearchEngineInfo("searx", - "https://searx.me", - "https://searx.me/?q={searchTerms}&categories=general", - true), - SearchEngineInfo("findx", - "https://findx.com", - "https://www.findx.com/search?q={searchTerms}&type=web", - true), - SearchEngineInfo("Brave", - "https://search.brave.com/", - "https://search.brave.com/search?q={searchTerms}", - true)}; - -} // namespace - -bool IsSearchEngine(const GURL& url) { +absl::optional FindSearchEngine(const GURL& url) { if (!url.is_valid()) { - return false; + return absl::nullopt; } - bool is_a_search = false; - - for (const auto& search_engine : kSearchEngines) { - const GURL hostname = GURL(search_engine.hostname); - if (!hostname.is_valid()) { - continue; - } - - if (search_engine.is_always_classed_as_a_search && - url.DomainIs(hostname.host_piece())) { - is_a_search = true; - break; - } - - size_t index = search_engine.query.find('{'); - std::string substring = search_engine.query.substr(0, index); - size_t href_index = url.spec().find(substring); - - if (index != std::string::npos && href_index != 
std::string::npos) { - is_a_search = true; - break; + const GURL url_with_empty_path = url.GetWithEmptyPath(); + const std::vector& search_engines = GetSearchEngines(); + for (const auto& search_engine : search_engines) { + if (RE2::FullMatch(url_with_empty_path.spec(), search_engine.url_pattern) || + RE2::FullMatch(url.spec(), search_engine.url_pattern)) { + return search_engine; } } - return is_a_search; + return absl::nullopt; } -std::string ExtractSearchQueryKeywords(const GURL& url) { - std::string search_query_keywords; - - if (!IsSearchEngine(url)) { - return search_query_keywords; - } - - if (!url.is_valid()) { - return search_query_keywords; - } - - for (const auto& search_engine : kSearchEngines) { - GURL search_engine_hostname = GURL(search_engine.hostname); - if (!search_engine_hostname.is_valid()) { - continue; - } - - if (!url.DomainIs(search_engine_hostname.host_piece())) { - continue; - } - - // Checking if search query in as defined in |search_engine_util.h| is - // defined, e.g. 
|https://searx.me/?q={searchTerms}&categories=general| - // matches |?q={| - std::string key; - if (!RE2::PartialMatch(search_engine.query, "\\?(.*?)\\={", &key)) { - return search_query_keywords; - } +} // namespace - net::GetValueForKeyInQuery(url, key, &search_query_keywords); - break; +bool IsSearchEngine(const GURL& url) { + const absl::optional search_engine_optional = + FindSearchEngine(url); + if (!search_engine_optional) { + return false; } - return search_query_keywords; + return true; } } // namespace ads diff --git a/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_util.h b/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_util.h index c1cd6d35e399..cc1431eb369c 100644 --- a/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_util.h +++ b/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_util.h @@ -6,16 +6,12 @@ #ifndef BRAVE_VENDOR_BAT_NATIVE_ADS_SRC_BAT_ADS_INTERNAL_BASE_SEARCH_ENGINE_UTIL_H_ #define BRAVE_VENDOR_BAT_NATIVE_ADS_SRC_BAT_ADS_INTERNAL_BASE_SEARCH_ENGINE_UTIL_H_ -#include - class GURL; namespace ads { bool IsSearchEngine(const GURL& url); -std::string ExtractSearchQueryKeywords(const GURL& url); - } // namespace ads #endif // BRAVE_VENDOR_BAT_NATIVE_ADS_SRC_BAT_ADS_INTERNAL_BASE_SEARCH_ENGINE_UTIL_H_ diff --git a/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_util_unittest.cc b/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_util_unittest.cc new file mode 100644 index 000000000000..ba54880996e5 --- /dev/null +++ b/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_util_unittest.cc @@ -0,0 +1,201 @@ +/* Copyright (c) 2022 The Brave Authors. All rights reserved. + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this file, + * You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#include "bat/ads/internal/base/search_engine_util.h" + +#include +#include + +#include "base/strings/strcat.h" +#include "bat/ads/internal/base/search_engine_domain_extensions_util.h" +#include "bat/ads/internal/base/search_engine_subdomains_util.h" +#include "testing/gtest/include/gtest/gtest.h" +#include "url/gurl.h" + +// npm run test -- brave_unit_tests --filter=BatAds* + +namespace ads { + +TEST(BatAdsSearchEngineUtilTest, IsMultilingualAmazonSearchEngine) { + // Arrange + const std::vector& domain_extensions = + GetAmazonSearchEngineDomainExtensions(); + + // Act + for (const auto& domain_extension : domain_extensions) { + const GURL url = + GURL(base::StrCat({"https://www.amazon.", domain_extension, "/"})); + EXPECT_TRUE(IsSearchEngine(url)); + } + + // Assert +} + +TEST(BatAdsSearchEngineUtilTest, IsNotMultilingualAmazonSearchEngine) { + // Arrange + + // Act + EXPECT_FALSE(IsSearchEngine(GURL("https://www.amazon.foobar/"))); + + // Assert +} + +TEST(BatAdsSearchEngineUtilTest, IsMultilingualGoogleSearchEngine) { + // Arrange + const std::vector& domain_extensions = + GetGoogleSearchEngineDomainExtensions(); + + // Act + for (const auto& domain_extension : domain_extensions) { + const GURL url = + GURL(base::StrCat({"https://www.google.", domain_extension, "/"})); + EXPECT_TRUE(IsSearchEngine(url)); + } + + // Assert +} + +TEST(BatAdsSearchEngineUtilTest, IsNotMultilingualGoogleSearchEngine) { + // Arrange + + // Act + EXPECT_FALSE(IsSearchEngine(GURL("https://www.google.foobar/"))); + + // Assert +} + +TEST(BatAdsSearchEngineUtilTest, IsMultilingualMojeekSearchEngine) { + // Arrange + const std::vector& domain_extensions = + GetMojeekSearchEngineDomainExtensions(); + + // Act + for (const auto& domain_extension : domain_extensions) { + const GURL url = + GURL(base::StrCat({"https://www.mojeek.", domain_extension, "/"})); + EXPECT_TRUE(IsSearchEngine(url)); + } + + // Assert +} + +TEST(BatAdsSearchEngineUtilTest, IsNotMultilingualMojeekSearchEngine) 
{ + // Arrange + + // Act + EXPECT_FALSE(IsSearchEngine(GURL("https://www.mojeek.foobar/"))); + + // Assert +} + +TEST(BatAdsSearchEngineUtilTest, IsMultilingualWikipediaSearchEngine) { + // Arrange + const std::vector& subdomains = + GetWikipediaSearchEngineSubdomains(); + + // Act + for (const auto& subdomain : subdomains) { + const GURL url = + GURL(base::StrCat({"https://", subdomain, ".wikipedia.org/"})); + EXPECT_TRUE(IsSearchEngine(url)); + } + + // Assert +} + +TEST(BatAdsSearchEngineUtilTest, IsNotMultilingualWikipediaSearchEngine) { + // Arrange + + // Act + EXPECT_FALSE(IsSearchEngine(GURL("https://foobar.wikipedia.org/"))); + + // Assert +} + +TEST(BatAdsSearchEngineUtilTest, IsMultilingualYahooSearchEngine) { + // Arrange + const std::vector& subdomains = GetYahooSearchEngineSubdomains(); + + // Act + for (const auto& subdomain : subdomains) { + const GURL url = + GURL(base::StrCat({"https://", subdomain, ".search.yahoo.com/"})); + EXPECT_TRUE(IsSearchEngine(url)); + } + + // Assert +} + +TEST(BatAdsSearchEngineUtilTest, IsNotMultilingualYahooSearchEngine) { + // Arrange + + // Act + EXPECT_FALSE(IsSearchEngine(GURL("https://foobar.search.yahoo.com/"))); + + // Assert +} + +TEST(BatAdsSearchEngineUtilTest, IsMonolingualSearchEngine) { + // Arrange + std::vector urls = {GURL(R"(https://developer.mozilla.org/en-US/)"), + GURL(R"(https://duckduckgo.com/)"), + GURL(R"(https://fireball.de/)"), + GURL(R"(https://github.com/)"), + GURL(R"(https://infogalactic.com/)"), + GURL(R"(https://search.brave.com/)"), + GURL(R"(https://search.yahoo.com/)"), + GURL(R"(https://stackoverflow.com/)"), + GURL(R"(https://swisscows.com/)"), + GURL(R"(https://twitter.com/explore/)"), + GURL(R"(https://www.baidu.com/)"), + GURL(R"(https://www.bing.com/)"), + GURL(R"(https://www.dogpile.com/)"), + GURL(R"(https://www.ecosia.org/)"), + GURL(R"(https://www.excite.com/)"), + GURL(R"(https://www.findx.com/)"), + GURL(R"(https://www.gigablast.com/)"), + 
GURL(R"(https://www.lycos.com/)"),
+                            GURL(R"(https://www.metacrawler.com/)"),
+                            GURL(R"(https://www.petalsearch.com/)"),
+                            GURL(R"(https://www.qwant.com/)"),
+                            GURL(R"(https://www.semanticscholar.org/)"),
+                            GURL(R"(https://www.sogou.com/)"),
+                            GURL(R"(https://www.startpage.com/)"),
+                            GURL(R"(https://www.webcrawler.com/)"),
+                            GURL(R"(https://www.wolframalpha.com/)"),
+                            GURL(R"(https://www.youtube.com/)"),
+                            GURL(R"(https://yandex.com/)")};
+
+  // Act
+  for (const auto& url : urls) {
+    EXPECT_TRUE(IsSearchEngine(url));
+  }
+
+  // Assert
+}
+
+TEST(BatAdsSearchEngineUtilTest, IsNotSearchEngine) {
+  // Arrange
+  const GURL url = GURL("https://foobar.com/");
+
+  // Act
+  const bool is_search_engine = IsSearchEngine(url);
+
+  // Assert
+  EXPECT_FALSE(is_search_engine);
+}
+
+TEST(BatAdsSearchEngineUtilTest, IsNotSearchEngineWithInvalidUrl) {
+  // Arrange
+  const GURL url = GURL("INVALID_URL");
+
+  // Act
+  const bool is_search_engine = IsSearchEngine(url);
+
+  // Assert
+  EXPECT_FALSE(is_search_engine);
+}
+
+}  // namespace ads
diff --git a/vendor/bat-native-ads/src/bat/ads/internal/base/search_engines_util.cc b/vendor/bat-native-ads/src/bat/ads/internal/base/search_engines_util.cc
new file mode 100644
index 000000000000..be7df975019e
--- /dev/null
+++ b/vendor/bat-native-ads/src/bat/ads/internal/base/search_engines_util.cc
@@ -0,0 +1,67 @@
+/* Copyright (c) 2022 The Brave Authors. All rights reserved.
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this file,
+ * You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "bat/ads/internal/base/search_engines_util.h"
+
+#include "base/no_destructor.h"
+#include "bat/ads/internal/base/search_engine_info.h"
+#include "bat/ads/internal/base/search_engine_results_page_url_pattern_util.h"
+#include "bat/ads/internal/base/search_engine_url_pattern_util.h"
+
+namespace ads {
+
+// Returns the table of known search engines, built once on first call. Each
+// entry lists, in order, the pattern for the search engine's URL, the pattern
+// for its results page URL and, presumably, the query key that holds the
+// search term ({} where none is given).
+// NOTE(review): "." in the patterns is not escaped; if they are evaluated as
+// regular expressions it matches any character - confirm this is intended.
+const std::vector<SearchEngineInfo>& GetSearchEngines() {
+  static base::NoDestructor<std::vector<SearchEngineInfo>> search_engines(
+      {{"https://ask.com/", "https://ask.com/web", "q"},
+       {"https://developer.mozilla.org/(.*)/",
+        "https://developer.mozilla.org/(.*)/search", "q"},
+       {"https://duckduckgo.com/", "https://duckduckgo.com/", "q"},
+       {"https://fireball.de/", "https://fireball.de/search", "q"},
+       {"https://github.com/", "https://github.com/search", "q"},
+       {"https://infogalactic.com/", "https://infogalactic.com/info/(.*)", {}},
+       {"https://search.brave.com/", "https://search.brave.com/search", "q"},
+       {"https://stackoverflow.com/", "https://stackoverflow.com/search", "q"},
+       {"https://swisscows.com/", "https://swisscows.com/web", "query"},
+       {"https://twitter.com/explore/", "https://twitter.com/search", "q"},
+       {"https://www.baidu.com/", "https://www.baidu.com/s", "wd"},
+       {"https://www.bing.com/", "https://www.bing.com/search", "q"},
+       {"https://www.dogpile.com/", "https://www.dogpile.com/serp", "q"},
+       {"https://www.ecosia.org/", "https://www.ecosia.org/search", "q"},
+       {"https://www.excite.com/", "https://results.excite.com/serp", "q"},
+       {"https://www.findx.com/", "https://www.findx.com/search", "q"},
+       {"https://www.gigablast.com/", "https://www.gigablast.com/search", "q"},
+       {"https://www.lycos.com/", "https://search.lycos.com/web/", "q"},
+       {"https://www.metacrawler.com/", "https://www.metacrawler.com/serp",
+        "q"},
+       {"https://www.petalsearch.com/", "https://www.petalsearch.com/search",
+        "query"},
+       {"https://www.qwant.com/", "https://www.qwant.com/", "q"},
+       {"https://www.semanticscholar.org/",
+        "https://www.semanticscholar.org/search", "q"},
+       {"https://www.sogou.com/", "https://www.sogou.com/web", "query"},
+       {"https://www.startpage.com/",
+        "https://www.startpage.com/sp/search",
+        {}},
+       {"https://www.webcrawler.com/", "https://www.webcrawler.com/serp", "q"},
+       {"https://www.wolframalpha.com/", "https://www.wolframalpha.com/input",
+        "i"},
+       {"https://www.youtube.com/", "https://www.youtube.com/results",
+        "search_query"},
+       {"https://yandex.com/", "https://yandex.com/search/", "text"},
+       {GetAmazonUrlPattern(), GetAmazonResultsPageUrlPattern(), "k"},
+       {GetGoogleUrlPattern(), GetGoogleResultsPageUrlPattern(), "q"},
+       {GetMojeekUrlPattern(), GetMojeekResultsPageUrlPattern(), "q"},
+       {GetWikipediaUrlPattern(), GetWikipediaResultsPageUrlPattern(), {}},
+       {GetYahooUrlPattern(), GetYahooResultsPageUrlPattern(), "p"}});
+  return *search_engines;
+}
+
+}  // namespace ads
diff --git a/vendor/bat-native-ads/src/bat/ads/internal/base/search_engines_util.h b/vendor/bat-native-ads/src/bat/ads/internal/base/search_engines_util.h
new file mode 100644
index 000000000000..19beb6e07f5d
--- /dev/null
+++ b/vendor/bat-native-ads/src/bat/ads/internal/base/search_engines_util.h
@@ -0,0 +1,19 @@
+/* Copyright (c) 2022 The Brave Authors. All rights reserved.
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this file,
+ * You can obtain one at http://mozilla.org/MPL/2.0/.
*/
+
+#ifndef BRAVE_VENDOR_BAT_NATIVE_ADS_SRC_BAT_ADS_INTERNAL_BASE_SEARCH_ENGINES_UTIL_H_
+#define BRAVE_VENDOR_BAT_NATIVE_ADS_SRC_BAT_ADS_INTERNAL_BASE_SEARCH_ENGINES_UTIL_H_
+
+#include <vector>
+
+namespace ads {
+
+struct SearchEngineInfo;
+
+const std::vector<SearchEngineInfo>& GetSearchEngines();
+
+}  // namespace ads
+
+#endif  // BRAVE_VENDOR_BAT_NATIVE_ADS_SRC_BAT_ADS_INTERNAL_BASE_SEARCH_ENGINES_UTIL_H_
diff --git a/vendor/bat-native-ads/src/bat/ads/internal/base/url_util.cc b/vendor/bat-native-ads/src/bat/ads/internal/base/url_util.cc
index 910586702971..94351833a3b7 100644
--- a/vendor/bat-native-ads/src/bat/ads/internal/base/url_util.cc
+++ b/vendor/bat-native-ads/src/bat/ads/internal/base/url_util.cc
@@ -5,7 +5,7 @@
 
 #include "bat/ads/internal/base/url_util.h"
 
-#include "base/check.h"
+#include "base/strings/strcat.h"
 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
 #include "third_party/re2/src/re2/re2.h"
 #include "url/gurl.h"
@@ -13,6 +13,24 @@
 
 namespace ads {
 
+// Returns |url| with its query, fragment and user info removed; the scheme,
+// host, port and path are kept. An invalid |url| yields an invalid (empty)
+// GURL.
+GURL GetUrlWithEmptyQuery(const GURL& url) {
+  if (!url.is_valid()) {
+    return GURL();
+  }
+
+  // Replace components on the parsed URL rather than re-assembling a string
+  // from scheme/host/path, which silently dropped a non-default port.
+  GURL::Replacements replacements;
+  replacements.ClearUsername();
+  replacements.ClearPassword();
+  replacements.ClearQuery();
+  replacements.ClearRef();
+  return url.ReplaceComponents(replacements);
+}
+
 bool DoesUrlMatchPattern(const GURL& url, const std::string& pattern) {
   if (!url.is_valid() || pattern.empty()) {
     return false;
diff --git a/vendor/bat-native-ads/src/bat/ads/internal/base/url_util.h b/vendor/bat-native-ads/src/bat/ads/internal/base/url_util.h
index 5de136932eba..33b8d921b20e 100644
--- a/vendor/bat-native-ads/src/bat/ads/internal/base/url_util.h
+++ b/vendor/bat-native-ads/src/bat/ads/internal/base/url_util.h
@@ -13,6 +13,10 @@ class GURL;
 
 namespace ads {
 
+// Returns |url| with its query, fragment and user info removed, or an invalid
+// GURL if |url| is invalid.
+GURL GetUrlWithEmptyQuery(const GURL& url);
+
 bool DoesUrlMatchPattern(const GURL& url, const std::string& pattern);
 
 bool SameDomainOrHost(const GURL& lhs, const GURL& rhs);
diff --git a/vendor/bat-native-ads/src/bat/ads/internal/conversions/sorts/conversions_ascending_sort.cc b/vendor/bat-native-ads/src/bat/ads/internal/conversions/sorts/conversions_ascending_sort.cc
index
ed37d036764c..7edee68fabfc 100644 --- a/vendor/bat-native-ads/src/bat/ads/internal/conversions/sorts/conversions_ascending_sort.cc +++ b/vendor/bat-native-ads/src/bat/ads/internal/conversions/sorts/conversions_ascending_sort.cc @@ -18,8 +18,8 @@ ConversionList ConversionsAscendingSort::Apply( auto sorted_list = list; std::sort(sorted_list.begin(), sorted_list.end(), - [](const ConversionInfo& a, const ConversionInfo& b) { - return a.type == "postview" && b.type == "postclick"; + [](const ConversionInfo& lhs, const ConversionInfo& rhs) { + return lhs.type == "postview" && rhs.type == "postclick"; }); return sorted_list; diff --git a/vendor/bat-native-ads/src/bat/ads/internal/conversions/sorts/conversions_descending_sort.cc b/vendor/bat-native-ads/src/bat/ads/internal/conversions/sorts/conversions_descending_sort.cc index ccb646d696f5..e8ee86149947 100644 --- a/vendor/bat-native-ads/src/bat/ads/internal/conversions/sorts/conversions_descending_sort.cc +++ b/vendor/bat-native-ads/src/bat/ads/internal/conversions/sorts/conversions_descending_sort.cc @@ -18,8 +18,8 @@ ConversionList ConversionsDescendingSort::Apply( auto sorted_list = list; std::sort(sorted_list.begin(), sorted_list.end(), - [](const ConversionInfo& a, const ConversionInfo& b) { - return a.type == "postclick" && b.type == "postview"; + [](const ConversionInfo& lhs, const ConversionInfo& rhs) { + return lhs.type == "postclick" && rhs.type == "postview"; }); return sorted_list; diff --git a/vendor/bat-native-ads/src/bat/ads/internal/processors/behavioral/purchase_intent/purchase_intent_processor.cc b/vendor/bat-native-ads/src/bat/ads/internal/processors/behavioral/purchase_intent/purchase_intent_processor.cc index 7df5102dac39..a5b96dacbf04 100644 --- a/vendor/bat-native-ads/src/bat/ads/internal/processors/behavioral/purchase_intent/purchase_intent_processor.cc +++ b/vendor/bat-native-ads/src/bat/ads/internal/processors/behavioral/purchase_intent/purchase_intent_processor.cc @@ -12,7 +12,7 @@ #include 
"base/strings/string_split.h" #include "base/strings/string_util.h" #include "bat/ads/internal/base/logging_util.h" -#include "bat/ads/internal/base/search_engine_util.h" +#include "bat/ads/internal/base/search_engine_results_page_util.h" #include "bat/ads/internal/base/string_util.h" #include "bat/ads/internal/base/url_util.h" #include "bat/ads/internal/deprecated/client/client.h" @@ -21,6 +21,7 @@ #include "bat/ads/internal/resources/behavioral/purchase_intent/purchase_intent_resource.h" #include "bat/ads/internal/resources/behavioral/purchase_intent/purchase_intent_signal_history_info.h" #include "bat/ads/internal/resources/behavioral/purchase_intent/purchase_intent_site_info.h" +#include "third_party/abseil-cpp/absl/types/optional.h" namespace ads { namespace processor { @@ -113,9 +114,11 @@ targeting::PurchaseIntentSignalInfo PurchaseIntent::ExtractSignal( const GURL& url) const { targeting::PurchaseIntentSignalInfo signal_info; - const std::string search_query = ExtractSearchQueryKeywords(url); + const absl::optional<std::string> search_query_optional = + ExtractSearchTermQueryValue(url); + if (search_query_optional) { + const std::string& search_query = search_query_optional.value(); - if (!search_query.empty()) { const SegmentList keyword_segments = GetSegmentsForSearchQuery(search_query); diff --git a/vendor/bat-native-ads/src/bat/ads/internal/processors/behavioral/purchase_intent/purchase_intent_processor_unittest.cc b/vendor/bat-native-ads/src/bat/ads/internal/processors/behavioral/purchase_intent/purchase_intent_processor_unittest.cc index d884a8bdc8a5..fc960cdb669b 100644 --- a/vendor/bat-native-ads/src/bat/ads/internal/processors/behavioral/purchase_intent/purchase_intent_processor_unittest.cc +++ b/vendor/bat-native-ads/src/bat/ads/internal/processors/behavioral/purchase_intent/purchase_intent_processor_unittest.cc @@ -225,7 +225,8 @@ TEST_F(BatAdsPurchaseIntentProcessorTest, ProcessMultipleUniqueKeywords) { FastForwardClockBy(base::Minutes(5)); const base::Time 
now_2 = Now(); - const GURL url_2 = GURL("https://google.com/?q=segment+keyword+1&bar=foo"); + const GURL url_2 = + GURL("https://www.google.com/search?q=segment+keyword+1&bar=foo"); processor.Process(url_2); // Assert