Merge pull request #2186 from mikiher/Fuzzy-Matching-Continued

Fuzzy matching continued
advplyr · Oct 8, 2023 · 5ad9f50 · 5ad9f50
2 parents db9d5c9 + f8f555b
commit 5ad9f50
Showing 1 changed file with 173 additions and 73 deletions.
diff --git a/server/finders/BookFinder.js b/server/finders/BookFinder.js
@@ -59,12 +59,17 @@ class BookFinder {
 
     // Remove single quotes (i.e. "Ender's Game" becomes "Enders Game")
     cleaned = cleaned.replace(/'/g, '')
-    return this.replaceAccentedChars(cleaned)
+    return this.replaceAccentedChars(cleaned).toLowerCase()
   }
 
   cleanAuthorForCompares(author) {
     if (!author) return ''
-    return this.replaceAccentedChars(author)
+    let cleanAuthor = this.replaceAccentedChars(author).toLowerCase()
+    // separate initials
+    cleanAuthor = cleanAuthor.replace(/([a-z])\.([a-z])/g, '$1. $2')
+    // remove middle initials
+    cleanAuthor = cleanAuthor.replace(/(?<=\w\w)(\s+[a-z]\.?)+(?=\s+\w\w)/g, '')
+    return cleanAuthor
   }
 
   filterSearchResults(books, title, author, maxTitleDistance, maxAuthorDistance) {
@@ -136,6 +141,10 @@ class BookFinder {
     if (!booksFiltered.length && books.length) {
       if (this.verbose) Logger.debug(`Search has ${books.length} matches, but no close title matches`)
     }
+    booksFiltered.sort((a, b) => {
+      return a.totalDistance - b.totalDistance
+    })
+
     return booksFiltered
   }
 
@@ -179,35 +188,152 @@ class BookFinder {
     return books
   }
 
-  addTitleCandidate(title, candidates) {
-    // Main variant
-    const cleanTitle = this.cleanTitleForCompares(title).trim()
-    if (!cleanTitle) return
-    candidates.add(cleanTitle)
-
-    let candidate = cleanTitle
-
-    // Remove subtitle
-    candidate = candidate.replace(/([,:;_]| by ).*/g, "").trim()
-    if (candidate)
-      candidates.add(candidate)
-
-    // Remove preceding/trailing numbers
-    candidate = candidate.replace(/^\d+ | \d+$/g, "").trim()
-    if (candidate)
-      candidates.add(candidate)
-
-    // Remove bitrate
-    candidate = candidate.replace(/(^| )\d+k(bps)?( |$)/, " ").trim()
-    if (candidate)
-      candidates.add(candidate)
-
-    // Remove edition
-    candidate = candidate.replace(/ (2nd|3rd|\d+th)\s+ed(\.|ition)?/, "").trim()
-    if (candidate)
-      candidates.add(candidate)
+  static TitleCandidates = class {
+
+    constructor(bookFinder, cleanAuthor) {
+      this.bookFinder = bookFinder
+      this.candidates = new Set()
+      this.cleanAuthor = cleanAuthor
+      this.priorities = {}
+      this.positions = {}
+    }
+
+    add(title, position = 0) {
+      // if title contains the author, remove it
+      if (this.cleanAuthor) {
+        const authorRe = new RegExp(`(^| | by |)${this.cleanAuthor}(?= |$)`, "g")
+        title = this.bookFinder.cleanAuthorForCompares(title).replace(authorRe, '').trim()
+      }
+
+      const titleTransformers = [
+        [/([,:;_]| by ).*/g, ''],                  // Remove subtitle
+        [/(^| )\d+k(bps)?( |$)/, ' '],             // Remove bitrate
+        [/ (2nd|3rd|\d+th)\s+ed(\.|ition)?/g, ''], // Remove edition
+        [/(^| |\.)(m4b|m4a|mp3)( |$)/g, ''],       // Remove file-type
+        [/ a novel.*$/g, ''],                      // Remove "a novel"
+        [/^\d+ | \d+$/g, ''],                      // Remove preceding/trailing numbers
+      ]
+
+      // Main variant
+      const cleanTitle = this.bookFinder.cleanTitleForCompares(title).trim()
+      if (!cleanTitle) return
+      this.candidates.add(cleanTitle)
+      this.priorities[cleanTitle] = 0
+      this.positions[cleanTitle] = position
+
+      let candidate = cleanTitle
+
+      for (const transformer of titleTransformers)
+        candidate = candidate.replace(transformer[0], transformer[1]).trim()
+
+      if (candidate != cleanTitle) {
+        if (candidate) {
+          this.candidates.add(candidate)
+          this.priorities[candidate] = 0
+          this.positions[candidate] = position
+        }
+        this.priorities[cleanTitle] = 1
+      }
+    }
+
+    get size() {
+      return this.candidates.size
+    }
+
+    getCandidates() {
+      var candidates = [...this.candidates]
+      candidates.sort((a, b) => {
+        // Candidates that include the author are likely low quality
+        const includesAuthorDiff = !b.includes(this.cleanAuthor) - !a.includes(this.cleanAuthor)
+        if (includesAuthorDiff) return includesAuthorDiff
+        // Candidates that include only digits are also likely low quality
+        const onlyDigits = /^\d+$/
+        const includesOnlyDigitsDiff = !onlyDigits.test(b) - !onlyDigits.test(a)
+        if (includesOnlyDigitsDiff) return includesOnlyDigitsDiff
+        // transformed candidates receive higher priority
+        const priorityDiff = this.priorities[a] - this.priorities[b]
+        if (priorityDiff) return priorityDiff
+        // if same priority, prefer candidates that are closer to the beginning (e.g. titles before subtitles)
+        const positionDiff = this.positions[a] - this.positions[b]
+        if (positionDiff) return positionDiff
+        // Start with longer candidates, as they are likely more specific
+        const lengthDiff = b.length - a.length
+        if (lengthDiff) return lengthDiff
+        return b.localeCompare(a)
+      })
+      Logger.debug(`[${this.constructor.name}] Found ${candidates.length} fuzzy title candidates`)
+      Logger.debug(candidates)
+      return candidates
+    }
+
+    delete(title) {
+      return this.candidates.delete(title)
+    }
   }
 
+  static AuthorCandidates = class {
+    constructor(bookFinder, cleanAuthor) {
+      this.bookFinder = bookFinder
+      this.candidates = new Set()
+      this.cleanAuthor = cleanAuthor
+      if (cleanAuthor) this.candidates.add(cleanAuthor)
+    }
+
+    validateAuthor(name, region = '', maxLevenshtein = 2) {
+      return this.bookFinder.audnexus.authorASINsRequest(name, region).then((asins) => {
+        for (const [i, asin] of asins.entries()) {
+          if (i > 10) break
+          let cleanName = this.bookFinder.cleanAuthorForCompares(asin.name)
+          if (!cleanName) continue
+          if (cleanName.includes(name)) return name
+          if (name.includes(cleanName)) return cleanName
+          if (levenshteinDistance(cleanName, name) <= maxLevenshtein) return cleanName
+        }
+        return ''
+      })
+    }
+
+    add(author) {
+      const cleanAuthor = this.bookFinder.cleanAuthorForCompares(author).trim()
+      if (!cleanAuthor) return
+      this.candidates.add(cleanAuthor)
+    }
+
+    get size() {
+      return this.candidates.size
+    }
+
+    get agressivelyCleanAuthor() {
+      if (this.cleanAuthor) {
+        const agressivelyCleanAuthor = this.cleanAuthor.replace(/[,/-].*$/, '').trim()
+        return agressivelyCleanAuthor ? agressivelyCleanAuthor : this.cleanAuthor
+      }
+      return ''
+    }
+
+    async getCandidates() {
+      var filteredCandidates = []
+      var promises = []
+      for (const candidate of this.candidates) {
+        promises.push(this.validateAuthor(candidate))
+      }
+      const results = [...new Set(await Promise.all(promises))]
+      filteredCandidates = results.filter(author => author)
+      // If no valid candidates were found, add back an aggressively cleaned author version
+      if (!filteredCandidates.length && this.cleanAuthor) filteredCandidates.push(this.agressivelyCleanAuthor)
+      // Always add an empty author candidate
+      filteredCandidates.push('')
+      Logger.debug(`[${this.constructor.name}] Found ${filteredCandidates.length} fuzzy author candidates`)
+      Logger.debug(filteredCandidates)
+      return filteredCandidates
+    }
+
+    delete(author) {
+      return this.candidates.delete(author)
+    }
+  }
+
+
   /**
    * Search for books including fuzzy searches
    * 
@@ -232,62 +358,36 @@ class BookFinder {
     books = await this.runSearch(title, author, provider, asin, maxTitleDistance, maxAuthorDistance)
 
     if (!books.length && maxFuzzySearches > 0) {
-      // normalize title and author
+      // Normalize title and author
       title = title.trim().toLowerCase()
       author = author?.trim().toLowerCase() || ''
 
+      const cleanAuthor = this.cleanAuthorForCompares(author)
+
       // Now run up to maxFuzzySearches fuzzy searches
-      let candidates = new Set()
-      let cleanedAuthor = this.cleanAuthorForCompares(author)
-      this.addTitleCandidate(title, candidates)
+      let authorCandidates = new BookFinder.AuthorCandidates(this, cleanAuthor)
 
-      // remove parentheses and their contents, and replace with a separator
-      const cleanTitle = title.replace(/\[.*?\]|\(.*?\)|{.*?}/g, " - ")
+      // Remove underscores and parentheses with their contents, and replace with a separator
+      const cleanTitle = title.replace(/\[.*?\]|\(.*?\)|{.*?}|_/g, " - ")
       // Split title into hypen-separated parts
       const titleParts = cleanTitle.split(/ - | -|- /)
-      for (const titlePart of titleParts) {
-        this.addTitleCandidate(titlePart, candidates)
-      }
-      // We already searched for original title
-      if (author == cleanedAuthor) candidates.delete(title)
-      if (candidates.size > 0) {
-        candidates = [...candidates]
-        candidates.sort((a, b) => {
-          // Candidates that include the author are likely low quality
-          const includesAuthorDiff = !b.includes(cleanedAuthor) - !a.includes(cleanedAuthor)
-          if (includesAuthorDiff) return includesAuthorDiff
-          // Candidates that include only digits are also likely low quality
-          const onlyDigits = /^\d+$/
-          const includesOnlyDigitsDiff = !onlyDigits.test(b) - !onlyDigits.test(a)
-          if (includesOnlyDigitsDiff) return includesOnlyDigitsDiff
-          // Start with longer candidaets, as they are likely more specific
-          const lengthDiff = b.length - a.length
-          if (lengthDiff) return lengthDiff
-          return b.localeCompare(a)
-        })
-        Logger.debug(`[BookFinder] Found ${candidates.length} fuzzy title candidates`, candidates)
-        for (const candidate of candidates) {
+      for (const titlePart of titleParts)
+        authorCandidates.add(titlePart)
+      authorCandidates = await authorCandidates.getCandidates()
+      for (const authorCandidate of authorCandidates) {
+        let titleCandidates = new BookFinder.TitleCandidates(this, authorCandidate)
+        for (const [position, titlePart] of titleParts.entries())
+          titleCandidates.add(titlePart, position)
+        titleCandidates = titleCandidates.getCandidates()
+        for (const titleCandidate of titleCandidates) {
+          if (titleCandidate == title && authorCandidate == author) continue // We already tried this
           if (++numFuzzySearches > maxFuzzySearches) return books
-          books = await this.runSearch(candidate, cleanedAuthor, provider, asin, maxTitleDistance, maxAuthorDistance)
-          if (books.length) break
-        }
-        if (!books.length) {
-          // Now try searching without the author
-          for (const candidate of candidates) {
-            if (++numFuzzySearches > maxFuzzySearches) return books
-            books = await this.runSearch(candidate, '', provider, asin, maxTitleDistance, maxAuthorDistance)
-            if (books.length) break
-          }
+          books = await this.runSearch(titleCandidate, authorCandidate, provider, asin, maxTitleDistance, maxAuthorDistance)
+          if (books.length) return books
         }
       }
     }
 
-    if (provider === 'openlibrary') {
-      books.sort((a, b) => {
-        return a.totalDistance - b.totalDistance
-      })
-    }
-
     return books
   }