Skip to content

Commit

Permalink
Classify search engine results page content
Browse files Browse the repository at this point in the history
  • Loading branch information
tmancey committed May 31, 2022
1 parent 43d559a commit 21e92e2
Show file tree
Hide file tree
Showing 29 changed files with 1,085 additions and 216 deletions.
4 changes: 4 additions & 0 deletions components/brave_ads/test/BUILD.gn
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,10 @@ source_set("brave_ads_unit_tests") {
"//brave/vendor/bat-native-ads/src/bat/ads/internal/base/number_util_unittest.cc",
"//brave/vendor/bat-native-ads/src/bat/ads/internal/base/platform_helper_mock.cc",
"//brave/vendor/bat-native-ads/src/bat/ads/internal/base/platform_helper_mock.h",
"//brave/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_results_page_url_pattern_util_unittest.cc",
"//brave/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_results_page_util_unittest.cc",
"//brave/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_url_pattern_util_unittest.cc",
"//brave/vendor/bat-native-ads/src/bat/ads/internal/base/search_engine_util_unittest.cc",
"//brave/vendor/bat-native-ads/src/bat/ads/internal/base/string_util_unittest.cc",
"//brave/vendor/bat-native-ads/src/bat/ads/internal/base/subdivision_code_util_unittest.cc",
"//brave/vendor/bat-native-ads/src/bat/ads/internal/base/time_constraint_util_unittest.cc",
Expand Down
12 changes: 12 additions & 0 deletions vendor/bat-native-ads/BUILD.gn
Original file line number Diff line number Diff line change
Expand Up @@ -350,10 +350,22 @@ source_set("ads") {
"src/bat/ads/internal/base/platform_helper.cc",
"src/bat/ads/internal/base/platform_helper.h",
"src/bat/ads/internal/base/platform_helper_types.h",
"src/bat/ads/internal/base/search_engine_domain_extensions_util.cc",
"src/bat/ads/internal/base/search_engine_domain_extensions_util.h",
"src/bat/ads/internal/base/search_engine_info.cc",
"src/bat/ads/internal/base/search_engine_info.h",
"src/bat/ads/internal/base/search_engine_results_page_url_pattern_util.cc",
"src/bat/ads/internal/base/search_engine_results_page_url_pattern_util.h",
"src/bat/ads/internal/base/search_engine_results_page_util.cc",
"src/bat/ads/internal/base/search_engine_results_page_util.h",
"src/bat/ads/internal/base/search_engine_subdomains_util.cc",
"src/bat/ads/internal/base/search_engine_subdomains_util.h",
"src/bat/ads/internal/base/search_engine_url_pattern_util.cc",
"src/bat/ads/internal/base/search_engine_url_pattern_util.h",
"src/bat/ads/internal/base/search_engine_util.cc",
"src/bat/ads/internal/base/search_engine_util.h",
"src/bat/ads/internal/base/search_engines_util.cc",
"src/bat/ads/internal/base/search_engines_util.h",
"src/bat/ads/internal/base/string_util.cc",
"src/bat/ads/internal/base/string_util.h",
"src/bat/ads/internal/base/subdivision_code_util.cc",
Expand Down
18 changes: 10 additions & 8 deletions vendor/bat-native-ads/src/bat/ads/internal/ads_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include "bat/ads/internal/ads_client_helper.h"
#include "bat/ads/internal/base/logging_util.h"
#include "bat/ads/internal/base/platform_helper.h"
#include "bat/ads/internal/base/search_engine_results_page_util.h"
#include "bat/ads/internal/base/search_engine_util.h"
#include "bat/ads/internal/base/string_util.h"
#include "bat/ads/internal/base/time_formatting_util.h"
Expand Down Expand Up @@ -174,12 +175,12 @@ void AdsImpl::OnHtmlLoaded(const int32_t tab_id,

const uint32_t hash = base::FastHash(html);
if (hash == last_html_loaded_hash_) {
BLOG(1, "HTML content has not changed");
return;
}
last_html_loaded_hash_ = hash;

transfer_->MaybeTransferAd(tab_id, redirect_chain);

conversions_->MaybeConvert(
redirect_chain, html,
conversions_resource_->get()->conversion_id_patterns);
Expand All @@ -196,15 +197,14 @@ void AdsImpl::OnTextLoaded(const int32_t tab_id,

const uint32_t hash = base::FastHash(text);
if (hash == last_text_loaded_hash_) {
BLOG(1, "Text content has not changed");
return;
}
last_text_loaded_hash_ = hash;

const GURL& url = redirect_chain.back();

if (!url.SchemeIsHTTPOrHTTPS()) {
BLOG(1, "Visited URL is not supported");
BLOG(1, url.scheme() << " scheme is not supported for text content");
return;
}

Expand All @@ -215,12 +215,14 @@ void AdsImpl::OnTextLoaded(const int32_t tab_id,
purchase_intent_processor_->Process(url);
}

if (IsSearchEngine(url)) {
BLOG(1, "Search engine pages are not supported for text classification");
} else {
const std::string stripped_text = StripNonAlphaCharacters(text);
text_classification_processor_->Process(stripped_text);
if (IsSearchEngine(url) && !IsSearchEngineResultsPage(url)) {
BLOG(1,
"Search engine landing page is not supported for text classification");
return;
}

const std::string stripped_text = StripNonAlphaCharacters(text);
text_classification_processor_->Process(stripped_text);
}

void AdsImpl::OnUserGesture(const int32_t page_transition_type) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
/* Copyright (c) 2022 The Brave Authors. All rights reserved.
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this file,
* You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "bat/ads/internal/base/search_engine_domain_extensions_util.h"

#include "base/no_destructor.h"

namespace ads {

const std::vector<std::string>& GetAmazonSearchEngineDomainExtensions() {
// See https://www.amazon.com/gp/navigation-country/select-country.
static base::NoDestructor<std::vector<std::string>> extensions(
{"ae", "ca", "cn", "co.jp", "co.uk", "com", "com.au",
"com.br", "com.mx", "de", "eg", "es", "fr", "in",
"it", "nl", "pl", "sa", "se", "sp", "tr"});
return *extensions;
}

const std::vector<std::string>& GetGoogleSearchEngineDomainExtensions() {
static base::NoDestructor<std::vector<std::string>> extensions(
{"ac", "ad", "ae", "al", "am", "as", "at",
"az", "ba", "be", "bf", "bg", "bi", "bj",
"bs", "bt", "ca", "cat", "cd", "cf", "cg",
"ch", "ci", "ci", "cl", "cm", "cn", "co.bw",
"co.ck", "co.cr", "co.id", "co.il", "co.im", "co.in", "co.je",
"co.jp", "co.ke", "co.kr", "co.ls", "co.ma", "co.mz", "co.nz",
"co.th", "co.tz", "co.ug", "co.uk", "co.uz", "co.ve", "co.vi",
"co.za", "co.zm", "co.zw", "com.af", "com.ag", "com.ai", "com.ar",
"com.au", "com.bd", "com.bh", "com.bn", "com.bo", "com.br", "com.by",
"com.bz", "com.co", "com.cu", "com.cy", "com.do", "com.ec", "com.eg",
"com.et", "com.fj", "com.gh", "com.gi", "com.gt", "com.hk", "com.jm",
"com.kg", "com.kh", "com.kw", "com.lb", "com.ly", "com.mt", "com.mx",
"com.my", "com.na", "com.nf", "com.ng", "com.ni", "com.np", "com.om",
"com.pa", "com.pe", "com.pg", "com.ph", "com.pk", "com.pr", "com.py",
"com.qa", "com.sa", "com.sb", "com.sg", "com.sl", "com.sv", "com.tj",
"com.tr", "com.tw", "com.ua", "com.uy", "com.vc", "com.vn", "com",
"cv", "cz", "de", "dj", "dk", "dm", "dz",
"ee", "es", "fi", "fm", "fr", "ga", "ge",
"gg", "gl", "gm", "gp", "gr", "gy", "hn",
"hr", "ht", "hu", "ie", "iq", "is", "it.ao",
"it", "jo", "ki", "kz", "la", "li", "lk",
"lt", "lu", "lv", "md", "me", "mg", "mk",
"ml", "mn", "ms", "mu", "mv", "mw", "ne",
"nl", "no", "nr", "nu", "pl", "pn", "ps",
"pt", "ro", "rs", "ru", "rw", "sc", "se",
"sh", "si", "sk", "sm", "sn", "so", "sr",
"st", "td", "tg", "tk", "tl", "tm", "tn",
"to", "tt", "vg", "vu", "ws"});
return *extensions;
}

const std::vector<std::string>& GetMojeekSearchEngineDomainExtensions() {
static base::NoDestructor<std::vector<std::string>> extensions(
{"co.uk", "com"});
return *extensions;
}

} // namespace ads
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
/* Copyright (c) 2022 The Brave Authors. All rights reserved.
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this file,
* You can obtain one at http://mozilla.org/MPL/2.0/. */

#ifndef BRAVE_VENDOR_BAT_NATIVE_ADS_SRC_BAT_ADS_INTERNAL_BASE_SEARCH_ENGINE_DOMAIN_EXTENSIONS_UTIL_H_
#define BRAVE_VENDOR_BAT_NATIVE_ADS_SRC_BAT_ADS_INTERNAL_BASE_SEARCH_ENGINE_DOMAIN_EXTENSIONS_UTIL_H_

#include <string>
#include <vector>

namespace ads {

// Accessors for the per-search-engine lists of domain extensions, i.e. the
// part of the hostname that follows the engine name such as "com" or "co.uk".
// Each returns a reference to a function-local static list, so the reference
// can be held by callers without copying.

// Domain extensions for Amazon storefronts.
const std::vector<std::string>& GetAmazonSearchEngineDomainExtensions();
// Domain extensions for Google search.
const std::vector<std::string>& GetGoogleSearchEngineDomainExtensions();
// Domain extensions for Mojeek search.
const std::vector<std::string>& GetMojeekSearchEngineDomainExtensions();

} // namespace ads

#endif // BRAVE_VENDOR_BAT_NATIVE_ADS_SRC_BAT_ADS_INTERNAL_BASE_SEARCH_ENGINE_DOMAIN_EXTENSIONS_UTIL_H_
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,12 @@ namespace ads {

SearchEngineInfo::SearchEngineInfo() = default;

SearchEngineInfo::SearchEngineInfo(const std::string& name,
const std::string& hostname,
const std::string& query,
bool is_always_classed_as_a_search)
: name(name),
hostname(hostname),
query(query),
is_always_classed_as_a_search(is_always_classed_as_a_search) {}
SearchEngineInfo::SearchEngineInfo(const std::string& url_pattern,
const std::string& result_page_url_pattern,
const std::string& search_term_query_key)
: url_pattern(url_pattern),
result_page_url_pattern(result_page_url_pattern),
search_term_query_key(search_term_query_key) {}

SearchEngineInfo::SearchEngineInfo(const SearchEngineInfo& info) = default;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,15 @@ namespace ads {
struct SearchEngineInfo final {
public:
SearchEngineInfo();
SearchEngineInfo(const std::string& name,
const std::string& hostname,
const std::string& query,
bool is_always_classed_as_a_search);
SearchEngineInfo(const std::string& url_pattern,
const std::string& result_page_url_pattern,
const std::string& search_term_query_key);
SearchEngineInfo(const SearchEngineInfo& info);
~SearchEngineInfo();

std::string name;
std::string hostname;
std::string query;
bool is_always_classed_as_a_search = false;
std::string url_pattern;
std::string result_page_url_pattern;
std::string search_term_query_key;
};

} // namespace ads
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
/* Copyright (c) 2022 The Brave Authors. All rights reserved.
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this file,
* You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "bat/ads/internal/base/search_engine_results_page_url_pattern_util.h"

#include "base/no_destructor.h"
#include "base/strings/strcat.h"
#include "bat/ads/internal/base/search_engine_url_pattern_util.h"

namespace ads {

const std::string& GetAmazonResultsPageUrlPattern() {
static base::NoDestructor<std::string> url_pattern(
base::StrCat({GetAmazonUrlPattern(), "s"}));
return *url_pattern;
}

const std::string& GetGoogleResultsPageUrlPattern() {
static base::NoDestructor<std::string> url_pattern(
base::StrCat({GetGoogleUrlPattern(), "search"}));
return *url_pattern;
}

const std::string& GetMojeekResultsPageUrlPattern() {
static base::NoDestructor<std::string> url_pattern(
base::StrCat({GetMojeekUrlPattern(), "search"}));
return *url_pattern;
}

const std::string& GetWikipediaResultsPageUrlPattern() {
static base::NoDestructor<std::string> url_pattern(
base::StrCat({GetWikipediaUrlPattern(), "wiki/(.*)"}));
return *url_pattern;
}

const std::string& GetYahooResultsPageUrlPattern() {
static base::NoDestructor<std::string> url_pattern(
base::StrCat({GetYahooUrlPattern(), "search(.*)"}));
return *url_pattern;
}

} // namespace ads
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
/* Copyright (c) 2022 The Brave Authors. All rights reserved.
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this file,
* You can obtain one at http://mozilla.org/MPL/2.0/. */

#ifndef BRAVE_VENDOR_BAT_NATIVE_ADS_SRC_BAT_ADS_INTERNAL_BASE_SEARCH_ENGINE_RESULTS_PAGE_URL_PATTERN_UTIL_H_
#define BRAVE_VENDOR_BAT_NATIVE_ADS_SRC_BAT_ADS_INTERNAL_BASE_SEARCH_ENGINE_RESULTS_PAGE_URL_PATTERN_UTIL_H_

#include <string>

namespace ads {

// Accessors for the per-search-engine results page URL patterns. Each pattern
// is the engine's URL pattern with the results page path appended, and is
// returned by reference to a function-local static string.

// Amazon URL pattern followed by "s".
const std::string& GetAmazonResultsPageUrlPattern();
// Google URL pattern followed by "search".
const std::string& GetGoogleResultsPageUrlPattern();
// Mojeek URL pattern followed by "search".
const std::string& GetMojeekResultsPageUrlPattern();
// Wikipedia URL pattern followed by "wiki/(.*)".
const std::string& GetWikipediaResultsPageUrlPattern();
// Yahoo URL pattern followed by "search(.*)".
const std::string& GetYahooResultsPageUrlPattern();

} // namespace ads

#endif // BRAVE_VENDOR_BAT_NATIVE_ADS_SRC_BAT_ADS_INTERNAL_BASE_SEARCH_ENGINE_RESULTS_PAGE_URL_PATTERN_UTIL_H_
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
/* Copyright (c) 2022 The Brave Authors. All rights reserved.
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this file,
* You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "bat/ads/internal/base/search_engine_results_page_url_pattern_util.h"

#include "testing/gtest/include/gtest/gtest.h"

// npm run test -- brave_unit_tests --filter=BatAds*

namespace ads {

// Pins the exact Amazon results page URL pattern, including every domain
// extension in list order.
TEST(BatAdsSearchEngineResultsPageUrlPatternUtilTest,
     GetAmazonResultsPageUrlPattern) {
  // Arrange
  constexpr char kExpectedUrlPattern[] =
      "https://"
      "www.amazon.(ae|ca|cn|co.jp|co.uk|com|com.au|com.br|com.mx|de|eg|es|fr|"
      "in|it|nl|pl|sa|se|sp|tr)/s";

  // Act
  const std::string& actual_url_pattern = GetAmazonResultsPageUrlPattern();

  // Assert
  EXPECT_EQ(kExpectedUrlPattern, actual_url_pattern);
}

// Pins the exact Google results page URL pattern, including every domain
// extension in list order.
TEST(BatAdsSearchEngineResultsPageUrlPatternUtilTest,
     GetGoogleResultsPageUrlPattern) {
  // Arrange
  constexpr char kExpectedUrlPattern[] =
      "https://"
      "www.google.(ac|ad|ae|al|am|as|at|az|ba|be|bf|bg|bi|bj|bs|bt|ca|cat|cd|"
      "cf|cg|ch|ci|ci|cl|cm|cn|co.bw|co.ck|co.cr|co.id|co.il|co.im|co.in|co.je|"
      "co.jp|co.ke|co.kr|co.ls|co.ma|co.mz|co.nz|co.th|co.tz|co.ug|co.uk|co.uz|"
      "co.ve|co.vi|co.za|co.zm|co.zw|com.af|com.ag|com.ai|com.ar|com.au|com.bd|"
      "com.bh|com.bn|com.bo|com.br|com.by|com.bz|com.co|com.cu|com.cy|com.do|"
      "com.ec|com.eg|com.et|com.fj|com.gh|com.gi|com.gt|com.hk|com.jm|com.kg|"
      "com.kh|com.kw|com.lb|com.ly|com.mt|com.mx|com.my|com.na|com.nf|com.ng|"
      "com.ni|com.np|com.om|com.pa|com.pe|com.pg|com.ph|com.pk|com.pr|com.py|"
      "com.qa|com.sa|com.sb|com.sg|com.sl|com.sv|com.tj|com.tr|com.tw|com.ua|"
      "com.uy|com.vc|com.vn|com|cv|cz|de|dj|dk|dm|dz|ee|es|fi|fm|fr|ga|ge|gg|"
      "gl|gm|gp|gr|gy|hn|hr|ht|hu|ie|iq|is|it.ao|it|jo|ki|kz|la|li|lk|lt|lu|lv|"
      "md|me|mg|mk|ml|mn|ms|mu|mv|mw|ne|nl|no|nr|nu|pl|pn|ps|pt|ro|rs|ru|rw|sc|"
      "se|sh|si|sk|sm|sn|so|sr|st|td|tg|tk|tl|tm|tn|to|tt|vg|vu|ws)/search";

  // Act
  const std::string& actual_url_pattern = GetGoogleResultsPageUrlPattern();

  // Assert
  EXPECT_EQ(kExpectedUrlPattern, actual_url_pattern);
}

// Pins the exact Mojeek results page URL pattern.
TEST(BatAdsSearchEngineResultsPageUrlPatternUtilTest,
     GetMojeekResultsPageUrlPattern) {
  // Arrange
  constexpr char kExpectedUrlPattern[] =
      "https://www.mojeek.(co.uk|com)/search";

  // Act
  const std::string& actual_url_pattern = GetMojeekResultsPageUrlPattern();

  // Assert
  EXPECT_EQ(kExpectedUrlPattern, actual_url_pattern);
}

// Pins the exact Wikipedia results page URL pattern, including every language
// subdomain in list order.
TEST(BatAdsSearchEngineResultsPageUrlPatternUtilTest,
     GetWikipediaResultsPageUrlPattern) {
  // Arrange
  constexpr char kExpectedUrlPattern[] =
      "https://"
      "(af|ar|arz|ast|az|azb|be|bg|bn|ca|ce|ceb|cs|cy|da|de|el|en|eo|es|et|eu|"
      "fa|fi|fr|gl|he|hi|hr|hu|hy|id|it|ja|ka|kk|ko|la|lt|lv|min|mk|ms|my|nan|"
      "nl|nn|no|pl|pt|ro|ru|sh|simple|sk|sl|sr|sv|ta|tg|th|tr|tt|uk|ur|uz|vi|"
      "vo|war|zh|zh-yue).wikipedia.org/wiki/(.*)";

  // Act
  const std::string& actual_url_pattern = GetWikipediaResultsPageUrlPattern();

  // Assert
  EXPECT_EQ(kExpectedUrlPattern, actual_url_pattern);
}

// Pins the exact Yahoo results page URL pattern, covering both the regional
// subdomains and the bare search.yahoo.com host.
TEST(BatAdsSearchEngineResultsPageUrlPatternUtilTest,
     GetYahooResultsPageUrlPattern) {
  // Arrange
  constexpr char kExpectedUrlPattern[] =
      "https://"
      "((au|be|br|ca|de|en-maktoob|es|espanol|fr|fr-be|gr|hk|id|ie|in|it|"
      "malaysia|nz|ph|qc|ro|se|sg|tw|uk|vn|www|za).search.yahoo.com/"
      "|search.yahoo.com/)search(.*)";

  // Act
  const std::string& actual_url_pattern = GetYahooResultsPageUrlPattern();

  // Assert
  EXPECT_EQ(kExpectedUrlPattern, actual_url_pattern);
}

} // namespace ads
Loading

0 comments on commit 21e92e2

Please sign in to comment.