From 988915cb64f1dfe85bd62c3cae3fa20873af73b4 Mon Sep 17 00:00:00 2001 From: omrilotan Date: Tue, 9 Jan 2024 22:39:35 +0000 Subject: [PATCH 1/8] Add a naive regex fallback --- CHANGELOG.md | 5 ++ README.md | 10 +++- package.json | 2 +- scripts/build/pattern.js | 15 +++-- src/index.ts | 25 ++++++++- src/patterns.json | 6 +- tests/spec/__snapshots__/test.ts.snap | 46 +++++++++++++++ tests/spec/test.ts | 80 +++++++++++++++++++++++++++ 8 files changed, 175 insertions(+), 14 deletions(-) create mode 100644 tests/spec/__snapshots__/test.ts.snap diff --git a/CHANGELOG.md b/CHANGELOG.md index a32f1af..6b829bb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # Changelog +## [4.4.0](https://github.com/omrilotan/isbot/compare/v4.3.0...v4.4.0) + +- Add a naive fallback pattern for engines that do not support lookbehind in regular expressions +- Add isbotNaive function to identify bots using a naive approach (simpler and faster) + ## [4.3.0](https://github.com/omrilotan/isbot/compare/v4.2.0...v4.3.0) - Accept `undefined` in place of user agent string to allow headers property to be used "as is" (`request.headers["user-agent"]`) diff --git a/README.md b/README.md index cb71842..a3af73b 100644 --- a/README.md +++ b/README.md @@ -39,10 +39,18 @@ Using JSDeliver CDN you can import an iife script // isbot is global isbot(navigator.userAgent) ``` -## Additional named imports +## How `isbot` maintains accuracy + +> `isbot`'s main feature is the accurate identification of bots using a regular expression. It uses expansive and regularly updated lists of user agent strings to create a regular expression that matches bots and only bots. +> +> This is done by using a lookbehind pattern which is not supported in all environments. A fallback is provided for environments that do not support lookbehind which is less accurate. The test suite includes a percentage of false positives and false negatives which is deemed acceptable for the fallback: 1% false positive and 75% bot coverage. + +## All named imports | import | Type | Description | | ------------------- | ------------------------------------------------- | ---------------------------------------------------------------------------- | +| isbot | _(userAgent: string): boolean_ | Check if the user agent is a bot | +| isbotNaive | _(userAgent: string): boolean_ | Check if the user agent is a bot using a naive pattern (less accurate) | | pattern | _RegExp_ | The regular expression used to identify bots | | list | _string[]_ | List of all individual pattern parts | | isbotMatch | _(userAgent: string): string \| null_ | The substring matched by the regular expression | diff --git a/package.json b/package.json index 29da451..72d76fa 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "isbot", - "version": "4.3.0", + "version": "4.4.0", "description": "🤖 Recognise bots/crawlers/spiders using the user agent string.", "keywords": [ "bot", diff --git a/scripts/build/pattern.js b/scripts/build/pattern.js index 8bc2a69..1acfda9 100755 --- a/scripts/build/pattern.js +++ b/scripts/build/pattern.js @@ -3,10 +3,13 @@ import { writeFile } from "node:fs/promises"; import patterns from "../../src/patterns.json" assert { type: "json" }; -const pattern = new RegExp(patterns.join("|"), "i").toString(); -const code = ` -export const regex: RegExp = ${pattern}; -export const parts: number = ${patterns.length}; -export const size: number = ${pattern.length}; -`.trim(); +const pattern = new RegExp( + patterns + .map((pattern) => pattern.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")) + .join("|"), +) + .toString() + .slice(1, -1); + +const code = `export const expression: string = "${pattern}";`; await writeFile("src/pattern.ts", code); diff --git a/src/index.ts b/src/index.ts index 7a14074..838e47c 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,18 +1,37 @@ -import { regex } from "./pattern"; +import { expression } from "./pattern"; import patternsList from "./patterns.json"; -// Workaround for TypeScript's type definition of imported variables and JSON files. +/** + * Naive bot pattern. + */ +const naiveExpression = /bot|spider|crawl|http|lighthouse/i; /** * A pattern that matches bot identifiers in user agent strings. */ -export const pattern: RegExp = regex; +let regex: RegExp; + +try { + regex = new RegExp(expression, "i"); +} catch (error) { + regex = naiveExpression; +} + +export const pattern = regex; + +// Workaround for TypeScript's type definition of imported variables and JSON files. /** * A list of bot identifiers to be used in a regular expression against user agent strings. */ export const list: string[] = patternsList; +/** + * Check if the given user agent includes a bot pattern. + */ +export const isbotNaive = (userAgent?: string | null): boolean => + Boolean(userAgent) && naiveExpression.test(userAgent); + /** * Check if the given user agent includes a bot pattern. */ diff --git a/src/patterns.json b/src/patterns.json index eb8621d..6cd21ad 100644 --- a/src/patterns.json +++ b/src/patterns.json @@ -11,7 +11,6 @@ "(? { describe("features", () => { test("pattern: pattern is a regex", () => { @@ -79,6 +91,65 @@ describe("isbot", () => { ); }); + describe("isbotNaive", () => { + test.each([75])( + "a large number of user agent strings can be detected (>%s%)", + (percent) => { + const ratio = + crawlers.filter((ua) => isbotNaive(ua)).length / crawlers.length; + expect(ratio).toBeLessThan(1); + expect(ratio).toBeGreaterThan(percent / 100); + }, + ); + test.each([1])( + "a small number of browsers is falsly detected as bots (<%s%)", + (percent) => { + const ratio = + browsers.filter((ua) => isbotNaive(ua)).length / browsers.length; + expect(ratio).toBeGreaterThan(0); + expect(ratio).toBeLessThan(percent / 100); + }, + ); + }); + + describe("regex fallback", () => { + beforeAll(async () => { + jest + .spyOn(globalThis, "RegExp") + .mockImplementation((pattern, flags): RegExp => { + if ((pattern as string).includes?.("?; + }); + afterAll(() => { + jest.restoreAllMocks(); + }); + test("Fallback regex detects commong crawlers", () => { + USER_AGENT_COMMON.forEach((ua) => { + if (!isbotInstance(ua)) { + throw new Error(`Failed to detect ${ua} as bot`); + } + }); + }); + test("fallback detects gotchas as bots", () => { + USER_AGENT_GOTCHAS.forEach((ua) => { + if (!isbotInstance(ua)) { + throw new Error(`Failed to detect ${ua} as bot (gotcha)`); + } + }); + }); + test("fallback does not detect browser as bot", () => { + expect(isbotInstance(BROWSER_USER_AGENT_EXAMPLE)).toBe(false); + }); + }); + describe("fixtures", () => { test(`✔︎ ${crawlers.length} user agent string should be recognised as crawler`, () => { let successCount = 0; @@ -107,4 +178,13 @@ describe("isbot", () => { expect(successCount).toBe(browsers.length); }); }); + + describe("module interface", () => { + test("interface is as expected", async () => { + const types = Object.entries(await import("../../src/index")).map( + ([key, value]) => [key, typeof value] as [string, string], + ); + expect(types).toMatchSnapshot(); + }); + }); }); From f410434cf72f53bec7b86c3e658bbb42ef79435c Mon Sep 17 00:00:00 2001 From: omrilotan Date: Wed, 10 Jan 2024 16:09:40 +0000 Subject: [PATCH 2/8] Separate pattern and regex so that they can be tree-shaken according to use --- README.md | 2 +- scripts/build/pattern.js | 5 ++++- src/index.ts | 33 +++++++++++++++++---------------- 3 files changed, 22 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index a3af73b..f6379a0 100644 --- a/README.md +++ b/README.md @@ -41,7 +41,7 @@ Using JSDeliver CDN you can import an iife script ## How `isbot` maintains accuracy -> `isbot`'s main feature is the accurate identification of bots using a regular expression. It uses expansive and regularly updated lists of user agent strings to create a regular expression that matches bots and only bots. +> `isbot`'s prized possession is the accurate identification of bots using a regular expression. It uses expansive and regularly updated lists of user agent strings to create a regular expression that matches bots and only bots. > > This is done by using a lookbehind pattern which is not supported in all environments. A fallback is provided for environments that do not support lookbehind which is less accurate. The test suite includes a percentage of false positives and false negatives which is deemed acceptable for the fallback: 1% false positive and 75% bot coverage. diff --git a/scripts/build/pattern.js b/scripts/build/pattern.js index 1acfda9..ce48981 100755 --- a/scripts/build/pattern.js +++ b/scripts/build/pattern.js @@ -11,5 +11,8 @@ const pattern = new RegExp( .toString() .slice(1, -1); -const code = `export const expression: string = "${pattern}";`; +const code = [ + `export const fullPattern: string = "${pattern}";`, + `export const regularExpression: RegExp = /${pattern}/i;` +].join("\n"); await writeFile("src/pattern.ts", code); diff --git a/src/index.ts b/src/index.ts index 838e47c..f16b218 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,25 +1,17 @@ -import { expression } from "./pattern"; +import { fullPattern, regularExpression } from "./pattern"; import patternsList from "./patterns.json"; /** * Naive bot pattern. */ -const naiveExpression = /bot|spider|crawl|http|lighthouse/i; +const naivePattern = /bot|spider|crawl|http|lighthouse/i; + +// Workaround for TypeScript's type definition of imported variables and JSON files. /** * A pattern that matches bot identifiers in user agent strings. */ -let regex: RegExp; - -try { - regex = new RegExp(expression, "i"); -} catch (error) { - regex = naiveExpression; -} - -export const pattern = regex; - -// Workaround for TypeScript's type definition of imported variables and JSON files. +export const pattern = regularExpression; /** * A list of bot identifiers to be used in a regular expression against user agent strings. @@ -30,13 +22,22 @@ export const list: string[] = patternsList; * Check if the given user agent includes a bot pattern. */ export const isbotNaive = (userAgent?: string | null): boolean => - Boolean(userAgent) && naiveExpression.test(userAgent); + Boolean(userAgent) && naivePattern.test(userAgent); /** * Check if the given user agent includes a bot pattern. */ -export const isbot = (userAgent?: string | null): boolean => - Boolean(userAgent) && pattern.test(userAgent); +let usedPattern: RegExp; +export function isbot(userAgent?: string | null): boolean { + if (typeof usedPattern === "undefined") { + try { + usedPattern = new RegExp(fullPattern, "i"); + } catch (error) { + usedPattern = naivePattern; + } + } + return Boolean(userAgent) && usedPattern.test(userAgent); +} /** * Create a custom isbot function with a custom pattern. From 916ceb68980581c1dce97626356616fce6ed4afe Mon Sep 17 00:00:00 2001 From: omrilotan Date: Wed, 10 Jan 2024 16:12:27 +0000 Subject: [PATCH 3/8] code formatting --- scripts/build/pattern.js | 2 +- src/index.ts | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/scripts/build/pattern.js b/scripts/build/pattern.js index ce48981..9d23215 100755 --- a/scripts/build/pattern.js +++ b/scripts/build/pattern.js @@ -13,6 +13,6 @@ const pattern = new RegExp( const code = [ `export const fullPattern: string = "${pattern}";`, - `export const regularExpression: RegExp = /${pattern}/i;` + `export const regularExpression: RegExp = /${pattern}/i;`, ].join("\n"); await writeFile("src/pattern.ts", code); diff --git a/src/index.ts b/src/index.ts index f16b218..b17cf32 100644 --- a/src/index.ts +++ b/src/index.ts @@ -29,14 +29,14 @@ export const isbotNaive = (userAgent?: string | null): boolean => */ let usedPattern: RegExp; export function isbot(userAgent?: string | null): boolean { - if (typeof usedPattern === "undefined") { - try { - usedPattern = new RegExp(fullPattern, "i"); - } catch (error) { - usedPattern = naivePattern; - } - } - return Boolean(userAgent) && usedPattern.test(userAgent); + if (typeof usedPattern === "undefined") { + try { + usedPattern = new RegExp(fullPattern, "i"); + } catch (error) { + usedPattern = naivePattern; + } + } + return Boolean(userAgent) && usedPattern.test(userAgent); } /** From e10e319563467a898c5ef0b53bbad514a02fc268 Mon Sep 17 00:00:00 2001 From: omrilotan Date: Wed, 10 Jan 2024 16:18:05 +0000 Subject: [PATCH 4/8] fix pattern build --- scripts/build/pattern.js | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/build/pattern.js b/scripts/build/pattern.js index 9d23215..63e806c 100755 --- a/scripts/build/pattern.js +++ b/scripts/build/pattern.js @@ -11,8 +11,10 @@ const pattern = new RegExp( .toString() .slice(1, -1); +const expression = new RegExp(patterns.join("|"), "i").toString(); + const code = [ `export const fullPattern: string = "${pattern}";`, - `export const regularExpression: RegExp = /${pattern}/i;`, + `export const regularExpression: RegExp = ${expression};`, ].join("\n"); await writeFile("src/pattern.ts", code); From e4740adb5ff69f2f122fc67df373d691cfc2ec3d Mon Sep 17 00:00:00 2001 From: omrilotan Date: Wed, 10 Jan 2024 16:29:01 +0000 Subject: [PATCH 5/8] Check isbot clean build (isbot only) with ES6 (2015) --- package.json | 2 +- scripts/test/procedure.sh | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) create mode 100755 scripts/test/procedure.sh diff --git a/package.json b/package.json index 72d76fa..d0ba56e 100644 --- a/package.json +++ b/package.json @@ -50,7 +50,7 @@ "build": "./scripts/build/procedure.sh", "format": "./scripts/format/procedure.sh", "pretest": "npm run build && npm run prepare", - "test": "node --expose-gc node_modules/.bin/jest --verbose", + "test": "./scripts/test/procedure.sh", "prepublishOnly": "./scripts/prepublish/procedure.sh", "prestart": "which parcel || npm i parcel-bundler --no-save", "start": "parcel page/index.pug --out-dir docs", diff --git a/scripts/test/procedure.sh b/scripts/test/procedure.sh new file mode 100755 index 0000000..5cdc764 --- /dev/null +++ b/scripts/test/procedure.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash + +failures=0 + +node --expose-gc node_modules/.bin/jest --verbose +failures=$((failures + $?)) + +echo $(which es-check) +if [[ -z $(which es-check) ]]; then + echo "es-check not found. install locally." + npm install es-check --no-save + failures=$((failures + $?)) +fi + +es-check es2015 index.iife.js +failures=$((failures + $?)) + +echo -e "→ Number of failures: ${failures}" +exit $failures From 8a40d9fc268ec3588352a1c25d94d2b8e12a74f1 Mon Sep 17 00:00:00 2001 From: omrilotan Date: Wed, 10 Jan 2024 17:09:50 +0000 Subject: [PATCH 6/8] Add sideEffects flag for webpack and friends --- package.json | 1 + src/index.ts | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/package.json b/package.json index d0ba56e..297d867 100644 --- a/package.json +++ b/package.json @@ -44,6 +44,7 @@ "default": "./index.js" } }, + "sideEffects": false, "types": "index.d.ts", "scripts": { "prepare": "./scripts/prepare/index.js", diff --git a/src/index.ts b/src/index.ts index b17cf32..1de7b7a 100644 --- a/src/index.ts +++ b/src/index.ts @@ -19,15 +19,15 @@ export const pattern = regularExpression; export const list: string[] = patternsList; /** - * Check if the given user agent includes a bot pattern. + * Check if the given user agent includes a bot pattern. Naive implementation (less accurate). */ export const isbotNaive = (userAgent?: string | null): boolean => Boolean(userAgent) && naivePattern.test(userAgent); +let usedPattern: RegExp; /** * Check if the given user agent includes a bot pattern. */ -let usedPattern: RegExp; export function isbot(userAgent?: string | null): boolean { if (typeof usedPattern === "undefined") { try { From cb982323c07c63855c38373b4c447768624ce74b Mon Sep 17 00:00:00 2001 From: omrilotan Date: Wed, 10 Jan 2024 17:35:36 +0000 Subject: [PATCH 7/8] Add tests to see all pattern uses are the same --- scripts/build/pattern.js | 4 +--- src/index.ts | 1 + tests/spec/test.ts | 7 ++++++- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/scripts/build/pattern.js b/scripts/build/pattern.js index 63e806c..2f9e5d4 100755 --- a/scripts/build/pattern.js +++ b/scripts/build/pattern.js @@ -7,9 +7,7 @@ const pattern = new RegExp( patterns .map((pattern) => pattern.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")) .join("|"), -) - .toString() - .slice(1, -1); +).source; const expression = new RegExp(patterns.join("|"), "i").toString(); diff --git a/src/index.ts b/src/index.ts index 1de7b7a..2dff947 100644 --- a/src/index.ts +++ b/src/index.ts @@ -31,6 +31,7 @@ let usedPattern: RegExp; export function isbot(userAgent?: string | null): boolean { if (typeof usedPattern === "undefined") { try { + // Build this RegExp dynamically to avoid syntax errors in older engines. usedPattern = new RegExp(fullPattern, "i"); } catch (error) { usedPattern = naivePattern; diff --git a/tests/spec/test.ts b/tests/spec/test.ts index 51a9f4a..a86edd6 100644 --- a/tests/spec/test.ts +++ b/tests/spec/test.ts @@ -10,6 +10,7 @@ import { createIsbot, createIsbotFromList, } from "../../src"; +import { fullPattern, regularExpression } from "../../src/pattern"; import { crawlers, browsers } from "../../fixtures"; let isbotInstance: any; @@ -131,7 +132,7 @@ describe("isbot", () => { afterAll(() => { jest.restoreAllMocks(); }); - test("Fallback regex detects commong crawlers", () => { + test("fallback regex detects commong crawlers", () => { USER_AGENT_COMMON.forEach((ua) => { if (!isbotInstance(ua)) { throw new Error(`Failed to detect ${ua} as bot`); @@ -186,5 +187,9 @@ describe("isbot", () => { ); expect(types).toMatchSnapshot(); }); + test("regular expressions exports are as expected", () => { + expect(pattern).toBe(regularExpression); + expect(new RegExp(fullPattern, "i").toString()).toBe(pattern.toString()); + }); }); }); From 5c23d8a34fbae20d436bd15769ce016a7afbffed Mon Sep 17 00:00:00 2001 From: omrilotan Date: Wed, 10 Jan 2024 17:51:45 +0000 Subject: [PATCH 8/8] A more accurate interface snapshot --- scripts/test/procedure.sh | 2 +- tests/spec/__snapshots__/test.ts.snap | 20 ++++++++++---------- tests/spec/test.ts | 2 +- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/scripts/test/procedure.sh b/scripts/test/procedure.sh index 5cdc764..7a9cefb 100755 --- a/scripts/test/procedure.sh +++ b/scripts/test/procedure.sh @@ -2,7 +2,7 @@ failures=0 -node --expose-gc node_modules/.bin/jest --verbose +node --expose-gc node_modules/.bin/jest --verbose $@ failures=$((failures + $?)) echo $(which es-check) diff --git a/tests/spec/__snapshots__/test.ts.snap b/tests/spec/__snapshots__/test.ts.snap index 323c894..e6a174a 100644 --- a/tests/spec/__snapshots__/test.ts.snap +++ b/tests/spec/__snapshots__/test.ts.snap @@ -4,43 +4,43 @@ exports[`isbot module interface interface is as expected 1`] = ` [ [ "pattern", - "object", + "RegExp", ], [ "list", - "object", + "Array", ], [ "isbotNaive", - "function", + "Function", ], [ "isbot", - "function", + "Function", ], [ "createIsbot", - "function", + "Function", ], [ "createIsbotFromList", - "function", + "Function", ], [ "isbotMatch", - "function", + "Function", ], [ "isbotMatches", - "function", + "Function", ], [ "isbotPattern", - "function", + "Function", ], [ "isbotPatterns", - "function", + "Function", ], ] `; diff --git a/tests/spec/test.ts b/tests/spec/test.ts index a86edd6..f21eb6c 100644 --- a/tests/spec/test.ts +++ b/tests/spec/test.ts @@ -183,7 +183,7 @@ describe("isbot", () => { describe("module interface", () => { test("interface is as expected", async () => { const types = Object.entries(await import("../../src/index")).map( - ([key, value]) => [key, typeof value] as [string, string], + ([key, value]) => [key, value.constructor.name] as [string, string], ); expect(types).toMatchSnapshot(); });