diff --git a/server/finders/BookFinder.js b/server/finders/BookFinder.js index bcf8ea0681..a0b64f5521 100644 --- a/server/finders/BookFinder.js +++ b/server/finders/BookFinder.js @@ -59,12 +59,17 @@ class BookFinder { // Remove single quotes (i.e. "Ender's Game" becomes "Enders Game") cleaned = cleaned.replace(/'/g, '') - return this.replaceAccentedChars(cleaned) + return this.replaceAccentedChars(cleaned).toLowerCase() } cleanAuthorForCompares(author) { if (!author) return '' - return this.replaceAccentedChars(author) + let cleanAuthor = this.replaceAccentedChars(author).toLowerCase() + // separate initials + cleanAuthor = cleanAuthor.replace(/([a-z])\.([a-z])/g, '$1. $2') + // remove middle initials + cleanAuthor = cleanAuthor.replace(/(?<=\w\w)(\s+[a-z]\.?)+(?=\s+\w\w)/g, '') + return cleanAuthor } filterSearchResults(books, title, author, maxTitleDistance, maxAuthorDistance) { @@ -136,6 +141,10 @@ class BookFinder { if (!booksFiltered.length && books.length) { if (this.verbose) Logger.debug(`Search has ${books.length} matches, but no close title matches`) } + booksFiltered.sort((a, b) => { + return a.totalDistance - b.totalDistance + }) + return booksFiltered } @@ -179,35 +188,152 @@ class BookFinder { return books } - addTitleCandidate(title, candidates) { - // Main variant - const cleanTitle = this.cleanTitleForCompares(title).trim() - if (!cleanTitle) return - candidates.add(cleanTitle) - - let candidate = cleanTitle - - // Remove subtitle - candidate = candidate.replace(/([,:;_]| by ).*/g, "").trim() - if (candidate) - candidates.add(candidate) - - // Remove preceding/trailing numbers - candidate = candidate.replace(/^\d+ | \d+$/g, "").trim() - if (candidate) - candidates.add(candidate) - - // Remove bitrate - candidate = candidate.replace(/(^| )\d+k(bps)?( |$)/, " ").trim() - if (candidate) - candidates.add(candidate) - - // Remove edition - candidate = candidate.replace(/ (2nd|3rd|\d+th)\s+ed(\.|ition)?/, "").trim() - if (candidate) - candidates.add(candidate) + static TitleCandidates = class { + + constructor(bookFinder, cleanAuthor) { + this.bookFinder = bookFinder + this.candidates = new Set() + this.cleanAuthor = cleanAuthor + this.priorities = {} + this.positions = {} + } + + add(title, position = 0) { + // if title contains the author, remove it + if (this.cleanAuthor) { + const authorRe = new RegExp(`(^| | by |)${this.cleanAuthor}(?= |$)`, "g") + title = this.bookFinder.cleanAuthorForCompares(title).replace(authorRe, '').trim() + } + + const titleTransformers = [ + [/([,:;_]| by ).*/g, ''], // Remove subtitle + [/(^| )\d+k(bps)?( |$)/, ' '], // Remove bitrate + [/ (2nd|3rd|\d+th)\s+ed(\.|ition)?/g, ''], // Remove edition + [/(^| |\.)(m4b|m4a|mp3)( |$)/g, ''], // Remove file-type + [/ a novel.*$/g, ''], // Remove "a novel" + [/^\d+ | \d+$/g, ''], // Remove preceding/trailing numbers + ] + + // Main variant + const cleanTitle = this.bookFinder.cleanTitleForCompares(title).trim() + if (!cleanTitle) return + this.candidates.add(cleanTitle) + this.priorities[cleanTitle] = 0 + this.positions[cleanTitle] = position + + let candidate = cleanTitle + + for (const transformer of titleTransformers) + candidate = candidate.replace(transformer[0], transformer[1]).trim() + + if (candidate != cleanTitle) { + if (candidate) { + this.candidates.add(candidate) + this.priorities[candidate] = 0 + this.positions[candidate] = position + } + this.priorities[cleanTitle] = 1 + } + } + + get size() { + return this.candidates.size + } + + getCandidates() { + var candidates = [...this.candidates] + candidates.sort((a, b) => { + // Candidates that include the author are likely low quality + const includesAuthorDiff = !b.includes(this.cleanAuthor) - !a.includes(this.cleanAuthor) + if (includesAuthorDiff) return includesAuthorDiff + // Candidates that include only digits are also likely low quality + const onlyDigits = /^\d+$/ + const includesOnlyDigitsDiff = !onlyDigits.test(b) - !onlyDigits.test(a) + if (includesOnlyDigitsDiff) return includesOnlyDigitsDiff + // transformed candidates receive higher priority + const priorityDiff = this.priorities[a] - this.priorities[b] + if (priorityDiff) return priorityDiff + // if same priorirty, prefer candidates that are closer to the beginning (e.g. titles before subtitles) + const positionDiff = this.positions[a] - this.positions[b] + if (positionDiff) return positionDiff + // Start with longer candidaets, as they are likely more specific + const lengthDiff = b.length - a.length + if (lengthDiff) return lengthDiff + return b.localeCompare(a) + }) + Logger.debug(`[${this.constructor.name}] Found ${candidates.length} fuzzy title candidates`) + Logger.debug(candidates) + return candidates + } + + delete(title) { + return this.candidates.delete(title) + } } + static AuthorCandidates = class { + constructor(bookFinder, cleanAuthor) { + this.bookFinder = bookFinder + this.candidates = new Set() + this.cleanAuthor = cleanAuthor + if (cleanAuthor) this.candidates.add(cleanAuthor) + } + + validateAuthor(name, region = '', maxLevenshtein = 2) { + return this.bookFinder.audnexus.authorASINsRequest(name, region).then((asins) => { + for (const [i, asin] of asins.entries()) { + if (i > 10) break + let cleanName = this.bookFinder.cleanAuthorForCompares(asin.name) + if (!cleanName) continue + if (cleanName.includes(name)) return name + if (name.includes(cleanName)) return cleanName + if (levenshteinDistance(cleanName, name) <= maxLevenshtein) return cleanName + } + return '' + }) + } + + add(author) { + const cleanAuthor = this.bookFinder.cleanAuthorForCompares(author).trim() + if (!cleanAuthor) return + this.candidates.add(cleanAuthor) + } + + get size() { + return this.candidates.size + } + + get agressivelyCleanAuthor() { + if (this.cleanAuthor) { + const agressivelyCleanAuthor = this.cleanAuthor.replace(/[,/-].*$/, '').trim() + return agressivelyCleanAuthor ? agressivelyCleanAuthor : this.cleanAuthor + } + return '' + } + + async getCandidates() { + var filteredCandidates = [] + var promises = [] + for (const candidate of this.candidates) { + promises.push(this.validateAuthor(candidate)) + } + const results = [...new Set(await Promise.all(promises))] + filteredCandidates = results.filter(author => author) + // If no valid candidates were found, add back an aggresively cleaned author version + if (!filteredCandidates.length && this.cleanAuthor) filteredCandidates.push(this.agressivelyCleanAuthor) + // Always add an empty author candidate + filteredCandidates.push('') + Logger.debug(`[${this.constructor.name}] Found ${filteredCandidates.length} fuzzy author candidates`) + Logger.debug(filteredCandidates) + return filteredCandidates + } + + delete(author) { + return this.candidates.delete(author) + } + } + + /** * Search for books including fuzzy searches * @@ -232,62 +358,36 @@ class BookFinder { books = await this.runSearch(title, author, provider, asin, maxTitleDistance, maxAuthorDistance) if (!books.length && maxFuzzySearches > 0) { - // normalize title and author + // Normalize title and author title = title.trim().toLowerCase() author = author?.trim().toLowerCase() || '' + const cleanAuthor = this.cleanAuthorForCompares(author) + // Now run up to maxFuzzySearches fuzzy searches - let candidates = new Set() - let cleanedAuthor = this.cleanAuthorForCompares(author) - this.addTitleCandidate(title, candidates) + let authorCandidates = new BookFinder.AuthorCandidates(this, cleanAuthor) - // remove parentheses and their contents, and replace with a separator - const cleanTitle = title.replace(/\[.*?\]|\(.*?\)|{.*?}/g, " - ") + // Remove underscores and parentheses with their contents, and replace with a separator + const cleanTitle = title.replace(/\[.*?\]|\(.*?\)|{.*?}|_/g, " - ") // Split title into hypen-separated parts const titleParts = cleanTitle.split(/ - | -|- /) - for (const titlePart of titleParts) { - this.addTitleCandidate(titlePart, candidates) - } - // We already searched for original title - if (author == cleanedAuthor) candidates.delete(title) - if (candidates.size > 0) { - candidates = [...candidates] - candidates.sort((a, b) => { - // Candidates that include the author are likely low quality - const includesAuthorDiff = !b.includes(cleanedAuthor) - !a.includes(cleanedAuthor) - if (includesAuthorDiff) return includesAuthorDiff - // Candidates that include only digits are also likely low quality - const onlyDigits = /^\d+$/ - const includesOnlyDigitsDiff = !onlyDigits.test(b) - !onlyDigits.test(a) - if (includesOnlyDigitsDiff) return includesOnlyDigitsDiff - // Start with longer candidaets, as they are likely more specific - const lengthDiff = b.length - a.length - if (lengthDiff) return lengthDiff - return b.localeCompare(a) - }) - Logger.debug(`[BookFinder] Found ${candidates.length} fuzzy title candidates`, candidates) - for (const candidate of candidates) { + for (const titlePart of titleParts) + authorCandidates.add(titlePart) + authorCandidates = await authorCandidates.getCandidates() + for (const authorCandidate of authorCandidates) { + let titleCandidates = new BookFinder.TitleCandidates(this, authorCandidate) + for (const [position, titlePart] of titleParts.entries()) + titleCandidates.add(titlePart, position) + titleCandidates = titleCandidates.getCandidates() + for (const titleCandidate of titleCandidates) { + if (titleCandidate == title && authorCandidate == author) continue // We already tried this if (++numFuzzySearches > maxFuzzySearches) return books - books = await this.runSearch(candidate, cleanedAuthor, provider, asin, maxTitleDistance, maxAuthorDistance) - if (books.length) break - } - if (!books.length) { - // Now try searching without the author - for (const candidate of candidates) { - if (++numFuzzySearches > maxFuzzySearches) return books - books = await this.runSearch(candidate, '', provider, asin, maxTitleDistance, maxAuthorDistance) - if (books.length) break - } + books = await this.runSearch(titleCandidate, authorCandidate, provider, asin, maxTitleDistance, maxAuthorDistance) + if (books.length) return books } } } - if (provider === 'openlibrary') { - books.sort((a, b) => { - return a.totalDistance - b.totalDistance - }) - } - return books }