From ac746f199b1fe51ba33cc70d9efe8bcdc6fea778 Mon Sep 17 00:00:00 2001 From: mikiher Date: Thu, 14 Sep 2023 21:32:20 +0000 Subject: [PATCH 1/3] Fuzzy Matching V1 --- server/finders/BookFinder.js | 128 +++++++++++++++++++++++++++++------ 1 file changed, 106 insertions(+), 22 deletions(-) diff --git a/server/finders/BookFinder.js b/server/finders/BookFinder.js index 6452bff0a8..307249df5b 100644 --- a/server/finders/BookFinder.js +++ b/server/finders/BookFinder.js @@ -60,13 +60,13 @@ class BookFinder { // Remove single quotes (i.e. "Ender's Game" becomes "Enders Game") cleaned = cleaned.replace(/'/g, '') cleaned = this.replaceAccentedChars(cleaned) - return cleaned.toLowerCase() + return cleaned } cleanAuthorForCompares(author) { if (!author) return '' var cleaned = this.replaceAccentedChars(author) - return cleaned.toLowerCase() + return cleaned } filterSearchResults(books, title, author, maxTitleDistance, maxAuthorDistance) { @@ -181,12 +181,113 @@ class BookFinder { return books } + addTitleCandidate(title, candidates) { + // Main variant + const cleanTitle = this.cleanTitleForCompares(title).trim() + if (!cleanTitle) return + candidates.add(cleanTitle) + + let candidate = cleanTitle + + // Remove subtitle + candidate = candidate.replace(/([,:;_]| by ).*/g, "").trim() + if (candidate) + candidates.add(candidate) + + // Remove preceding/trailing numbers + candidate = candidate.replace(/^\d+ | \d+$/g, "").trim() + if (candidate) + candidates.add(candidate) + + // Remove bitrate + candidate = candidate.replace(/(^| )\d+k(bps)?( |$)/, " ").trim() + if (candidate) + candidates.add(candidate) + + // Remove edition + candidate = candidate.replace(/ (2nd|3rd|\d+th)\s+ed(\.|ition)?/, "").trim() + if (candidate) + candidates.add(candidate) + } + async search(provider, title, author, isbn, asin, options = {}) { var books = [] - var maxTitleDistance = !isNaN(options.titleDistance) ? Number(options.titleDistance) : 4 - var maxAuthorDistance = !isNaN(options.authorDistance) ? 
Number(options.authorDistance) : 4 + const maxTitleDistance = !isNaN(options.titleDistance) ? Number(options.titleDistance) : 4 + const maxAuthorDistance = !isNaN(options.authorDistance) ? Number(options.authorDistance) : 4 + const maxFuzzySearches = 5 + var numFuzzySearches = 0 + + if (!title) + return books + + books = await this.runSearch(title, author, provider, asin, maxTitleDistance, maxAuthorDistance) + + if (!books.length && maxFuzzySearches > 0) { + // normalize title and author + title = title.trim().toLowerCase() + author = author.trim().toLowerCase() + + // Now run up to maxFuzzySearches fuzzy searches + var candidates = new Set() + var cleanedAuthor = this.cleanAuthorForCompares(author) + this.addTitleCandidate(title, candidates) + + // remove parentheses and their contents, and replace with a separator + const cleanTitle = title.replace(/\[.*?\]|\(.*?\)|{.*?}/g, " - ") + // Split title into hyphen-separated parts + const titleParts = cleanTitle.split(/ - | -|- /) + for (const titlePart of titleParts) { + this.addTitleCandidate(titlePart, candidates) + } + // We already searched for original title + if (author == cleanedAuthor) candidates.delete(title) + if (candidates.size > 0) { + candidates = [...candidates] + candidates.sort((a, b) => { + // Candidates that include the author are likely low quality + const includesAuthorDiff = !b.includes(cleanedAuthor) - !a.includes(cleanedAuthor) + if (includesAuthorDiff) return includesAuthorDiff + // Candidates that include only digits are also likely low quality + const onlyDigits = /^\d+$/ + const includesOnlyDigitsDiff = !onlyDigits.test(b) - !onlyDigits.test(a) + if (includesOnlyDigitsDiff) return includesOnlyDigitsDiff + // Start with longer candidates, as they are likely more specific + const lengthDiff = b.length - a.length + if (lengthDiff) return lengthDiff + return b.localeCompare(a) + }) + Logger.debug(`[BookFinder] Found ${candidates.length} fuzzy title candidates`) + Logger.debug(candidates) + for 
(const candidate of candidates) { + if (++numFuzzySearches > maxFuzzySearches) return books + books = await this.runSearch(candidate, cleanedAuthor, provider, asin, maxTitleDistance, maxAuthorDistance) + if (books.length) break + } + if (!books.length) { + // Now try searching without the author + for (const candidate of candidates) { + if (++numFuzzySearches > maxFuzzySearches) return books + books = await this.runSearch(candidate, '', provider, asin, maxTitleDistance, maxAuthorDistance) + if (books.length) break + } + } + } + } + + if (provider === 'openlibrary') { + books.sort((a, b) => { + return a.totalDistance - b.totalDistance + }) + } + + return books + } + + async runSearch(title, author, provider, asin, maxTitleDistance, maxAuthorDistance) { Logger.debug(`Book Search: title: "${title}", author: "${author || ''}", provider: ${provider}`) + var books = [] + if (provider === 'google') { books = await this.getGoogleBooksResults(title, author) } else if (provider.startsWith('audible')) { @@ -203,23 +304,6 @@ class BookFinder { else { books = await this.getGoogleBooksResults(title, author) } - - if (!books.length && !options.currentlyTryingCleaned) { - var cleanedTitle = this.cleanTitleForCompares(title) - var cleanedAuthor = this.cleanAuthorForCompares(author) - if (cleanedTitle == title && cleanedAuthor == author) return books - - Logger.debug(`Book Search, no matches.. 
checking cleaned title and author`) - options.currentlyTryingCleaned = true - return this.search(provider, cleanedTitle, cleanedAuthor, isbn, asin, options) - } - - if (provider === 'openlibrary') { - books.sort((a, b) => { - return a.totalDistance - b.totalDistance - }) - } - return books } @@ -253,4 +337,4 @@ class BookFinder { return this.audnexus.getChaptersByASIN(asin, region) } } -module.exports = new BookFinder() \ No newline at end of file +module.exports = new BookFinder() From 67bbe2151383803cc762991574c640cbefafd8f4 Mon Sep 17 00:00:00 2001 From: mikiher Date: Fri, 15 Sep 2023 09:24:19 +0000 Subject: [PATCH 2/3] Make quick-match more conservative --- server/finders/BookFinder.js | 2 +- server/scanner/Scanner.js | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/server/finders/BookFinder.js b/server/finders/BookFinder.js index 307249df5b..18865f2b77 100644 --- a/server/finders/BookFinder.js +++ b/server/finders/BookFinder.js @@ -214,7 +214,7 @@ class BookFinder { var books = [] const maxTitleDistance = !isNaN(options.titleDistance) ? Number(options.titleDistance) : 4 const maxAuthorDistance = !isNaN(options.authorDistance) ? Number(options.authorDistance) : 4 - const maxFuzzySearches = 5 + const maxFuzzySearches = !isNaN(options.maxFuzzySearches) ? 
Number(options.maxFuzzySearches) : 5 var numFuzzySearches = 0 if (!title) diff --git a/server/scanner/Scanner.js b/server/scanner/Scanner.js index ee1fcc3742..d2037f8b9f 100644 --- a/server/scanner/Scanner.js +++ b/server/scanner/Scanner.js @@ -36,7 +36,7 @@ class Scanner { var searchISBN = options.isbn || libraryItem.media.metadata.isbn var searchASIN = options.asin || libraryItem.media.metadata.asin - var results = await BookFinder.search(provider, searchTitle, searchAuthor, searchISBN, searchASIN) + var results = await BookFinder.search(provider, searchTitle, searchAuthor, searchISBN, searchASIN, { maxFuzzySearches: 2 }) if (!results.length) { return { warning: `No ${provider} match found` From 61c48602e86abade6b186f0af6d2d9326f0f24c4 Mon Sep 17 00:00:00 2001 From: advplyr Date: Fri, 22 Sep 2023 16:03:41 -0500 Subject: [PATCH 3/3] Add jsdocs to BookFinder search functions --- server/finders/BookFinder.js | 45 +++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/server/finders/BookFinder.js b/server/finders/BookFinder.js index 18865f2b77..96735cc958 100644 --- a/server/finders/BookFinder.js +++ b/server/finders/BookFinder.js @@ -52,21 +52,19 @@ class BookFinder { cleanTitleForCompares(title) { if (!title) return '' // Remove subtitle if there (i.e. "Cool Book: Coolest Ever" becomes "Cool Book") - var stripped = this.stripSubtitle(title) + let stripped = this.stripSubtitle(title) // Remove text in paranthesis (i.e. "Ender's Game (Ender's Saga)" becomes "Ender's Game") - var cleaned = stripped.replace(/ *\([^)]*\) */g, "") + let cleaned = stripped.replace(/ *\([^)]*\) */g, "") // Remove single quotes (i.e. 
"Ender's Game" becomes "Enders Game") cleaned = cleaned.replace(/'/g, '') - cleaned = this.replaceAccentedChars(cleaned) - return cleaned + return this.replaceAccentedChars(cleaned) } cleanAuthorForCompares(author) { if (!author) return '' - var cleaned = this.replaceAccentedChars(author) - return cleaned + return this.replaceAccentedChars(author) } filterSearchResults(books, title, author, maxTitleDistance, maxAuthorDistance) { @@ -210,12 +208,23 @@ class BookFinder { candidates.add(candidate) } + /** + * Search for books including fuzzy searches + * + * @param {string} provider + * @param {string} title + * @param {string} author + * @param {string} isbn + * @param {string} asin + * @param {{titleDistance:number, authorDistance:number, maxFuzzySearches:number}} options + * @returns {Promise} + */ async search(provider, title, author, isbn, asin, options = {}) { - var books = [] + let books = [] const maxTitleDistance = !isNaN(options.titleDistance) ? Number(options.titleDistance) : 4 const maxAuthorDistance = !isNaN(options.authorDistance) ? Number(options.authorDistance) : 4 const maxFuzzySearches = !isNaN(options.maxFuzzySearches) ? 
Number(options.maxFuzzySearches) : 5 - var numFuzzySearches = 0 + let numFuzzySearches = 0 if (!title) return books @@ -228,8 +237,8 @@ class BookFinder { author = author.trim().toLowerCase() // Now run up to maxFuzzySearches fuzzy searches - var candidates = new Set() - var cleanedAuthor = this.cleanAuthorForCompares(author) + let candidates = new Set() + let cleanedAuthor = this.cleanAuthorForCompares(author) this.addTitleCandidate(title, candidates) // remove parentheses and their contents, and replace with a separator @@ -256,8 +265,7 @@ class BookFinder { if (lengthDiff) return lengthDiff return b.localeCompare(a) }) - Logger.debug(`[BookFinder] Found ${candidates.length} fuzzy title candidates`) - Logger.debug(candidates) + Logger.debug(`[BookFinder] Found ${candidates.length} fuzzy title candidates`, candidates) for (const candidate of candidates) { if (++numFuzzySearches > maxFuzzySearches) return books books = await this.runSearch(candidate, cleanedAuthor, provider, asin, maxTitleDistance, maxAuthorDistance) @@ -283,10 +291,21 @@ class BookFinder { return books } + /** + * Search for books + * + * @param {string} title + * @param {string} author + * @param {string} provider + * @param {string} asin only used for audible providers + * @param {number} maxTitleDistance only used for openlibrary provider + * @param {number} maxAuthorDistance only used for openlibrary provider + * @returns {Promise} + */ async runSearch(title, author, provider, asin, maxTitleDistance, maxAuthorDistance) { Logger.debug(`Book Search: title: "${title}", author: "${author || ''}", provider: ${provider}`) - var books = [] + let books = [] if (provider === 'google') { books = await this.getGoogleBooksResults(title, author)