JabRef · tobiasdiez · Dec 28, 2020 · Dec 18, 2020 · Dec 18, 2020 · Dec 18, 2020
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -80,6 +80,8 @@ Note that this project **does not** adhere to [Semantic Versioning](http://semve
 - We fixed an issue where the password for a shared SQL database was not remembered [#6869](https://github.com/JabRef/jabref/issues/6869)
 - We fixed an issue where newly added entires were not synced to a shared SQL database [#7176](https://github.com/JabRef/jabref/issues/7176)
 - We fixed an issue where the PDF-Content importer threw an exception when no DOI number is present at the first page of the PDF document [#7203](https://github.com/JabRef/jabref/issues/7203)
+- We fixed an issue where authors that only have last names were incorrectly identified as institutes when generating citation keys [#7199](https://github.com/JabRef/jabref/issues/7199)
+- We fixed an issue where institutes were incorrectly identified as universities when generating citation keys [#6942](https://github.com/JabRef/jabref/issues/6942)
 
 ### Removed
 

diff --git a/src/main/java/module-info.java b/src/main/java/module-info.java
@@ -57,6 +57,7 @@
     requires reactfx;
     requires commons.cli;
     requires com.github.tomtung.latex2unicode;
+    requires fastparse;
     requires jbibtex;
     requires citeproc.java;
     requires antlr.runtime;

diff --git a/src/main/java/org/jabref/logic/citationkeypattern/BracketedPattern.java b/src/main/java/org/jabref/logic/citationkeypattern/BracketedPattern.java
@@ -31,6 +31,7 @@
 import org.jabref.model.entry.field.InternalField;
 import org.jabref.model.entry.field.StandardField;
 import org.jabref.model.strings.LatexToUnicodeAdapter;
+import org.jabref.model.strings.StringUtil;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -59,13 +60,14 @@ public class BracketedPattern {
      */
     private static final Pattern NOT_CAPITAL_CHARACTER = Pattern.compile("[^A-Z]");
     /**
-     * Matches with "({[A-Z]}+)", which should be used to abbreviate the name of an institution
+     * Matches uppercase English letters between "({" and "})", which should be used to abbreviate the name of an institution
      */
-    private static final Pattern ABBREVIATIONS = Pattern.compile(".*\\(\\{[A-Z]+}\\).*");
+    private static final Pattern INLINE_ABBREVIATION = Pattern.compile("(?<=\\(\\{)[A-Z]+(?=}\\))");
     /**
      * Matches with "dep"/"dip", case insensitive
      */
     private static final Pattern DEPARTMENTS = Pattern.compile("^d[ei]p.*", Pattern.CASE_INSENSITIVE);
+    private static final Pattern WHITESPACE = Pattern.compile("\\p{javaWhitespace}");
 
     private enum Institution {
         SCHOOL,
@@ -74,9 +76,9 @@ private enum Institution {
         TECHNOLOGY;
 
         /**
-         * Matches "uni" at the start of a string or after a space, case insensitive
+         * Matches strings that start with "uni" followed by "v", "b", or the end of the string, case insensitive
          */
-        private static final Pattern UNIVERSITIES = Pattern.compile("^uni.*", Pattern.CASE_INSENSITIVE);
+        private static final Pattern UNIVERSITIES = Pattern.compile("^uni(v|b|$).*", Pattern.CASE_INSENSITIVE);
         /**
          * Matches with "tech", case insensitive
          */
@@ -492,9 +494,9 @@ private static AuthorList createAuthorList(String unparsedAuthors) {
         for (Author author : AuthorList.parse(unparsedAuthors).getAuthors()) {
             // If the author is an institution, use an institution key instead of the full name
             String lastName = author.getLast()
-                                    .map(LatexToUnicodeAdapter::format)
-                                    .map(isInstitution(author) ?
-                                            BracketedPattern::generateInstitutionKey : Function.identity())
+                                    .map(lastPart -> isInstitution(author) ?
+                                            generateInstitutionKey(lastPart) :
+                                            LatexToUnicodeAdapter.format(lastPart))
                                     .orElse(null);
             authorList.addAuthor(
                     author.getFirst().map(LatexToUnicodeAdapter::format).orElse(null),
@@ -508,14 +510,15 @@ private static AuthorList createAuthorList(String unparsedAuthors) {
     }
 
     /**
-     * Checks if an author is an institution by verifying that only the last name is present.
+     * Checks if an author is an institution which can get a citation key from {@link #generateInstitutionKey(String)}.
      *
      * @param author the checked author
-     * @return true if only the last name is present
+     * @return true if only the last name is present and it contains at least one whitespace character.
      */
     private static boolean isInstitution(Author author) {
         return author.getFirst().isEmpty() && author.getFirstAbbr().isEmpty() && author.getJr().isEmpty()
-                && author.getVon().isEmpty() && author.getLast().isPresent();
+                && author.getVon().isEmpty() && author.getLast().isPresent()
+                && WHITESPACE.matcher(author.getLast().get()).find();
     }
 
     /**
@@ -658,52 +661,31 @@ public static String camelizeSignificantWordsInTitle(String title) {
     }
 
     public static String removeSmallWords(String title) {
-        StringJoiner stringJoiner = new StringJoiner(" ");
         String formattedTitle = formatTitle(title);
 
         try (Scanner titleScanner = new Scanner(formattedTitle)) {
-            mainl:
-            while (titleScanner.hasNext()) {
-                String word = titleScanner.next();
-
-                for (String smallWord : Word.SMALLER_WORDS) {
-                    if (word.equalsIgnoreCase(smallWord)) {
-                        continue mainl;
-                    }
-                }
-
-                stringJoiner.add(word);
-            }
+            return titleScanner.tokens()
+                               .filter(Predicate.not(
+                                       Word::isSmallerWord))
+                               .collect(Collectors.joining(" "));
         }
-
-        return stringJoiner.toString();
     }
 
     private static String getTitleWordsWithSpaces(int number, String title) {
-        StringJoiner stringJoiner = new StringJoiner(" ");
         String formattedTitle = formatTitle(title);
-        int words = 0;
 
         try (Scanner titleScanner = new Scanner(formattedTitle)) {
-            while (titleScanner.hasNext() && (words < number)) {
-                String word = titleScanner.next();
-
-                stringJoiner.add(word);
-                words++;
-            }
+            return titleScanner.tokens()
+                               .limit(number)
+                               .collect(Collectors.joining(" "));
         }
-
-        return stringJoiner.toString();
     }
 
     private static String keepLettersAndDigitsOnly(String in) {
-        StringBuilder stringBuilder = new StringBuilder();
-        for (int i = 0; i < in.length(); i++) {
-            if (Character.isLetterOrDigit(in.charAt(i))) {
-                stringBuilder.append(in.charAt(i));
-            }
-        }
-        return stringBuilder.toString();
+        return in.codePoints()
+                 .filter(Character::isLetterOrDigit)
+                 .collect(StringBuilder::new, StringBuilder::appendCodePoint, StringBuilder::append)
+                 .toString();
     }
 
     /**
@@ -1131,51 +1113,6 @@ protected static List<String> parseFieldAndModifiers(String arg) {
         return parts;
     }
 
-    /**
-     * Will remove diacritics from the content.
-     * <ul>
-     * <li>Replaces umlaut: \"x with xe, e.g. \"o -> oe, \"u -> ue, etc.</li>
-     * <li>Removes all other diacritics: \?x -> x, e.g. \'a -> a, etc.</li>
-     * </ul>
-     *
-     * @param content The content.
-     * @return The content without diacritics.
-     */
-    private static String removeDiacritics(String content) {
-        if (content.isEmpty()) {
-            return content;
-        }
-
-        String result = content;
-        // Replace umlaut with '?e'
-        result = result.replaceAll("\\{\\\\\"([a-zA-Z])\\}", "$1e");
-        result = result.replaceAll("\\\\\"\\{([a-zA-Z])\\}", "$1e");
-        result = result.replaceAll("\\\\\"([a-zA-Z])", "$1e");
-        // Remove diacritics
-        result = result.replaceAll("\\{\\\\.([a-zA-Z])\\}", "$1");
-        result = result.replaceAll("\\\\.\\{([a-zA-Z])\\}", "$1");
-        result = result.replaceAll("\\\\.([a-zA-Z])", "$1");
-        return result;
-    }
-
-    /**
-     * Unifies umlauts.
-     * <ul>
-     * <li>Replaces: $\ddot{\mathrm{X}}$ (an alternative umlaut) with: {\"X}</li>
-     * <li>Replaces: \?{X} and \?X with {\?X}, where ? is a diacritic symbol</li>
-     * </ul>
-     *
-     * @param content The content.
-     * @return The content with unified diacritics.
-     */
-    private static String unifyDiacritics(String content) {
-        return content.replaceAll(
-                "\\$\\\\ddot\\{\\\\mathrm\\{([^\\}])\\}\\}\\$",
-                "{\\\"$1}").replaceAll(
-                "(\\\\[^\\-a-zA-Z])\\{?([a-zA-Z])\\}?",
-                "{$1$2}");
-    }
-
     /**
      * <p>
      * An author or editor may be and institution not a person. In that case the key generator builds very long keys,
@@ -1248,15 +1185,20 @@ private static String generateInstitutionKey(String content) {
             return "";
         }
 
+        Matcher matcher = INLINE_ABBREVIATION.matcher(content);
+        if (matcher.find()) {
+            return LatexToUnicodeAdapter.format(matcher.group());
+        }
+
         String result = content;
-        result = unifyDiacritics(result);
-        result = result.replaceAll("^\\{", "").replaceAll("}$", "");
-        Matcher matcher = ABBREVIATIONS.matcher(result);
-        if (matcher.matches()) {
-            return matcher.group(1);
+        try {
+            result = LatexToUnicodeAdapter.format(content);
+        } catch (IllegalArgumentException e) {
+            LOGGER.warn("{} could not be converted to unicode. This can result in an incorrect or missing institute citation key", content);
         }
+        // Special characters can't be allowed past this point because the citation key generator might replace them with multiple mixed-case characters
+        result = StringUtil.replaceSpecialCharacters(result);
 
-        result = removeDiacritics(result);
         String[] institutionNameTokens = result.split(",");
 
         // Key parts
@@ -1335,7 +1277,6 @@ private static String generateInstitutionKey(String content) {
      * institution keyword and has an uppercase first letter, except univ/tech key word.
      *
      * @param word to check
-     * @return
      */
     private static boolean noOtherInstitutionKeyWord(String word) {
         return !DEPARTMENTS.matcher(word).matches()

diff --git a/src/main/java/org/jabref/logic/formatter/casechanger/Word.java b/src/main/java/org/jabref/logic/formatter/casechanger/Word.java
@@ -1,19 +1,24 @@
 package org.jabref.logic.formatter.casechanger;
 
 import java.util.Arrays;
-import java.util.Collections;
 import java.util.HashSet;
 import java.util.Locale;
 import java.util.Objects;
 import java.util.Set;
+import java.util.stream.Collectors;
 
 /**
  * Represents a word in a title of a bibtex entry.
  * <p>
  * A word can have protected chars (enclosed in '{' '}') and may be a small (a, an, the, ...) word.
  */
 public final class Word {
+    /**
+     * Set containing common lowercase function words
+     */
     public static final Set<String> SMALLER_WORDS;
+    private final char[] chars;
+    private final boolean[] protectedChars;
 
     static {
         Set<String> smallerWords = new HashSet<>();
@@ -26,12 +31,11 @@ public final class Word {
         smallerWords.addAll(Arrays.asList("and", "but", "for", "nor", "or", "so", "yet"));
 
         // unmodifiable for thread safety
-        SMALLER_WORDS = Collections.unmodifiableSet(smallerWords);
+        SMALLER_WORDS = smallerWords.stream()
+                            .map(word -> word.toLowerCase(Locale.ROOT))
+                            .collect(Collectors.toUnmodifiableSet());
     }
 
-    private final char[] chars;
-    private final boolean[] protectedChars;
-
     public Word(char[] chars, boolean[] protectedChars) {
         this.chars = Objects.requireNonNull(chars);
         this.protectedChars = Objects.requireNonNull(protectedChars);
@@ -41,16 +45,21 @@ public Word(char[] chars, boolean[] protectedChars) {
         }
     }
 
+    /**
+     * Case-insensitive check against {@link Word#SMALLER_WORDS}. Checks for common function words.
+     */
+    public static boolean isSmallerWord(String word) {
+        return SMALLER_WORDS.contains(word.toLowerCase(Locale.ROOT));
+    }
+
     /**
      * Only change letters of the word that are unprotected to upper case.
      */
     public void toUpperCase() {
         for (int i = 0; i < chars.length; i++) {
-            if (protectedChars[i]) {
-                continue;
+            if (!protectedChars[i]) {
+                chars[i] = Character.toUpperCase(chars[i]);
             }
-
-            chars[i] = Character.toUpperCase(chars[i]);
         }
     }
 
@@ -59,24 +68,18 @@ public void toUpperCase() {
      */
     public void toLowerCase() {
         for (int i = 0; i < chars.length; i++) {
-            if (protectedChars[i]) {
-                continue;
+            if (!protectedChars[i]) {
+                chars[i] = Character.toLowerCase(chars[i]);
             }
-
-            chars[i] = Character.toLowerCase(chars[i]);
         }
     }
 
     public void toUpperFirst() {
         for (int i = 0; i < chars.length; i++) {
-            if (protectedChars[i]) {
-                continue;
-            }
-
-            if (i == 0) {
-                chars[i] = Character.toUpperCase(chars[i]);
-            } else {
-                chars[i] = Character.toLowerCase(chars[i]);
+            if (!protectedChars[i]) {
+                chars[i] = (i == 0) ?
+                        Character.toUpperCase(chars[i]) :
+                        Character.toLowerCase(chars[i]);
             }
         }
     }

diff --git a/src/main/java/org/jabref/model/strings/LatexToUnicodeAdapter.java b/src/main/java/org/jabref/model/strings/LatexToUnicodeAdapter.java
@@ -5,23 +5,56 @@
 import java.util.regex.Pattern;
 
 import com.github.tomtung.latex2unicode.LaTeX2Unicode;
+import fastparse.core.Parsed;
 
 /**
  * Adapter class for the latex2unicode lib. This is an alternative to our LatexToUnicode class
  */
 public class LatexToUnicodeAdapter {
 
-    private static Pattern underscoreMatcher = Pattern.compile("_(?!\\{)");
+    private static final Pattern UNDERSCORE_MATCHER = Pattern.compile("_(?!\\{)");
 
-    private static String replacementChar = "\uFFFD";
+    private static final String REPLACEMENT_CHAR = "\uFFFD";
 
-    private static Pattern underscorePlaceholderMatcher = Pattern.compile(replacementChar);
+    private static final Pattern UNDERSCORE_PLACEHOLDER_MATCHER = Pattern.compile(REPLACEMENT_CHAR);
 
+    /**
+     * Attempts to resolve all LaTeX in the String.
+     *
+     * @param inField a String containing LaTeX
+     * @return a String with LaTeX resolved into Unicode, or the NFC-normalized original String if the LaTeX could not be parsed
+     */
     public static String format(String inField) {
         Objects.requireNonNull(inField);
 
-        String toFormat = underscoreMatcher.matcher(inField).replaceAll(replacementChar);
-        toFormat = Normalizer.normalize(LaTeX2Unicode.convert(toFormat), Normalizer.Form.NFC);
-        return underscorePlaceholderMatcher.matcher(toFormat).replaceAll("_");
+        try {
+            return parse(inField);
+        } catch (IllegalArgumentException ignored) {
+            return Normalizer.normalize(inField, Normalizer.Form.NFC);
+        }
+    }
+
+    /**
+     * Attempts to resolve all LaTeX in the String.
+     *
+     * @param inField a String containing LaTeX
+     * @return a String with LaTeX resolved into Unicode
+     * @throws IllegalArgumentException if the LaTeX could not be parsed; the underlying failure is attached as the cause
+     */
+    public static String parse(String inField) throws IllegalArgumentException {
+        Objects.requireNonNull(inField);
+        String toFormat = UNDERSCORE_MATCHER.matcher(inField).replaceAll(REPLACEMENT_CHAR);
+        try {
+            var parsingResult = LaTeX2Unicode.parse(toFormat);
+            if (parsingResult instanceof Parsed.Success) {
+                String text = parsingResult.get().value();
+                toFormat = Normalizer.normalize(text, Normalizer.Form.NFC);
+                return UNDERSCORE_PLACEHOLDER_MATCHER.matcher(toFormat).replaceAll("_");
+            }
+            throw new IllegalArgumentException("Parsing of latex failed.");
+        } catch (Throwable throwable) {
+            // Everything, including the parse-failure exception thrown above, is rewrapped; keep it as the cause
+            throw new IllegalArgumentException("An error occurred while attempting to parse latex.", throwable);
+        }
     }
 }