linkedin · pgalbraith · Jul 3, 2017 · Jul 3, 2017 · Jul 3, 2017 · Sep 16, 2018
diff --git a/.editorconfig b/.editorconfig
@@ -0,0 +1,9 @@
+root = true
+
+[*]
+end_of_line = lf
+insert_final_newline = true
+charset = utf-8
+indent_style = space
+indent_size = 2
+tab_size = 8
diff --git a/gradle.properties b/gradle.properties
@@ -1,5 +1,5 @@
 #Version
-version=0.1.17
+version=0.1.20
 
 #long-running Gradle process speeds up local builds
 #to stop the daemon run 'ligradle --stop'

diff --git a/url-detector/pom.xml b/url-detector/pom.xml
@@ -2,14 +2,14 @@
          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
   <modelVersion>4.0.0</modelVersion>
 
-  <groupId>com.linkedin.urls</groupId>
+  <groupId>io.github.pgalbraith</groupId>
   <artifactId>url-detector</artifactId>
-  <version>0.1.17</version>
+  <version>0.1.20</version>
   <packaging>jar</packaging>
 
-  <name>com.linkedin.urls:url-detector</name>
+  <name>io.github.pgalbraith:url-detector</name>
   <description>A Java library to detect and normalize URLs in text</description>
-  <url>https://github.com/linkedin/URL-Detector</url>
+  <url>https://github.com/pgalbraith/URL-Detector</url>
 
   <licenses>
     <license>
@@ -19,11 +19,15 @@
   </licenses>
 
   <scm>
-    <connection>scm:git:git://github.com/linkedin/URL-Detector.git</connection>
-    <developerConnection>scm:git:ssh://github.com:linkedin/URL-Detector.git</developerConnection>
-    <url>https://github.com/linkedin/URL-Detector/tree/master</url>
+    <connection>scm:git:git://github.com/pgalbraith/URL-Detector.git</connection>
+    <developerConnection>scm:git:ssh://github.com:pgalbraith/URL-Detector.git</developerConnection>
+    <url>https://github.com/pgalbraith/URL-Detector/tree/master</url>
   </scm>
-
+
+  <properties>
+    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+  </properties>
+
   <dependencies>
     <dependency>
         <groupId>org.testng</groupId>
@@ -94,13 +98,15 @@
       <plugin>
         <groupId>org.apache.maven.plugins</groupId>
         <artifactId>maven-deploy-plugin</artifactId>
+        <version>2.7</version>
         <configuration>
           <skip>true</skip>
         </configuration>
       </plugin>
       <plugin>
         <groupId>org.sonatype.plugins</groupId>
         <artifactId>nexus-staging-maven-plugin</artifactId>
+        <version>1.6.8</version>
         <executions>
           <execution>
             <id>default-deploy</id>

diff --git a/url-detector/src/main/java/com/linkedin/urls/detection/DomainNameReader.java b/url-detector/src/main/java/com/linkedin/urls/detection/DomainNameReader.java
@@ -102,7 +102,11 @@ public enum ReaderNextState {
     /**
      * Finished reading, next step should be to read the query string.
      */
-    ReadQueryString
+    ReadQueryString,
+    /**
+     * Hit an '@': what was read is likely a username/password, not a domain; next step should be to read it as such.
+     */
+    ReadUserPass
   }
 
   /**
@@ -332,6 +336,10 @@ public ReaderNextState readDomainName() {
       } else if (curr == '#') {
         //continue by reading the fragment
         return checkDomainNameValid(ReaderNextState.ReadFragment, curr);
+      } else if (curr == '@') {
+        //this may not have been a domain after all, but rather a username/password instead
+        _reader.goBack();
+        return ReaderNextState.ReadUserPass;
       } else if (CharUtils.isDot(curr)
           || (curr == '%' && _reader.canReadChars(2) && _reader.peek(2).equalsIgnoreCase(HEX_ENCODED_DOT))) {
         //if the current character is a dot or a urlEncodedDot

diff --git a/url-detector/src/main/java/com/linkedin/urls/detection/InputTextReader.java b/url-detector/src/main/java/com/linkedin/urls/detection/InputTextReader.java
@@ -14,11 +14,6 @@
  */
 public class InputTextReader {
 
-  /**
-   * The number of times something can be backtracked is this multiplier times the length of the string.
-   */
-  protected static final int MAX_BACKTRACK_MULTIPLIER = 10;
-
   /**
    * The content to read.
    */
@@ -29,16 +24,6 @@ public class InputTextReader {
    */
   private int _index = 0;
 
-  /**
-   * Contains the amount of characters that were backtracked. This is used for performance analysis.
-   */
-  private int _backtracked = 0;
-
-  /**
-   * When detecting for exceeding the backtrack limit, make sure the text is at least 20 characters.
-   */
-  private final static int MINIMUM_BACKTRACK_LENGTH = 20;
-
   /**
    * Creates a new instance of the InputTextReader using the content to read.
    * @param content The content to read.
@@ -102,47 +87,18 @@ public int getPosition() {
     return _index;
   }
 
-  /**
-   * Gets the total number of characters that were backtracked when reading.
-   */
-  public int getBacktrackedCount() {
-    return _backtracked;
-  }
-
   /**
    * Moves the index to the specified position.
    * @param position The position to set the index to.
    */
   public void seek(int position) {
-    int backtrackLength = Math.max(_index - position, 0);
-    _backtracked += backtrackLength;
     _index = position;
-    checkBacktrackLoop(backtrackLength);
   }
 
   /**
    * Goes back a single character.
    */
   public void goBack() {
-    _backtracked++;
     _index--;
-    checkBacktrackLoop(1);
-  }
-
-  private void checkBacktrackLoop(int backtrackLength) {
-    if (_backtracked > (_content.length * MAX_BACKTRACK_MULTIPLIER)) {
-      if (backtrackLength < MINIMUM_BACKTRACK_LENGTH) {
-        backtrackLength = MINIMUM_BACKTRACK_LENGTH;
-      }
-
-      int start = Math.max(_index, 0);
-      if (start + backtrackLength > _content.length) {
-        backtrackLength = _content.length - start;
-      }
-
-      String badText = new String(_content, start, backtrackLength);
-      throw new NegativeArraySizeException("Backtracked max amount of characters. Endless loop detected. Bad Text: '"
-          + badText + "'");
-    }
   }
 }
diff --git a/url-detector/src/main/java/com/linkedin/urls/detection/UrlDetector.java b/url-detector/src/main/java/com/linkedin/urls/detection/UrlDetector.java
@@ -125,15 +125,6 @@ public UrlDetector(String content, UrlDetectorOptions options) {
     _options = options;
   }
 
-  /**
-   * Gets the number of characters that were backtracked while reading the input. This is useful for performance
-   * measurement.
-   * @return The count of characters that were backtracked while reading.
-   */
-  public int getBacktracked() {
-    return _reader.getBacktrackedCount();
-  }
-
   /**
    * Detects the urls and returns a list of detected url strings.
    * @return A list with detected urls.
@@ -154,13 +145,14 @@ private void readDefault() {
     while (!_reader.eof()) {
       //read the next char to process.
       char curr = _reader.read();
-
       switch (curr) {
         case ' ':
           //space was found, check if it's a valid single level domain.
           if (_options.hasFlag(UrlDetectorOptions.ALLOW_SINGLE_LEVEL_DOMAIN) && _buffer.length() > 0 && _hasScheme) {
             _reader.goBack();
-            readDomainName(_buffer.substring(length));
+            if (!readDomainName(_buffer.substring(length))) {
+              readEnd(ReadEndState.InvalidUrl);
+            }
           }
           _buffer.append(curr);
           readEnd(ReadEndState.InvalidUrl);
@@ -178,7 +170,9 @@ private void readDefault() {
               _buffer.append(_reader.read());
               _buffer.append(_reader.read());
 
-              readDomainName(_buffer.substring(length));
+              if (!readDomainName(_buffer.substring(length))) {
+                readEnd(ReadEndState.InvalidUrl);
+              }
               length = 0;
             }
           }
@@ -188,14 +182,18 @@ private void readDefault() {
         case '\uFF61':
         case '.': //"." was found, read the domain name using the start from length.
           _buffer.append(curr);
-          readDomainName(_buffer.substring(length));
+          if (!readDomainName(_buffer.substring(length))) {
+            readEnd(ReadEndState.InvalidUrl);
+          }
           length = 0;
           break;
         case '@': //Check the domain name after a username
           if (_buffer.length() > 0) {
             _currentUrlMarker.setIndex(UrlPart.USERNAME_PASSWORD, length);
             _buffer.append(curr);
-            readDomainName(null);
+            if (!readDomainName(null)) {
+              readEnd(ReadEndState.InvalidUrl);
+            }
             length = 0;
           }
           break;
@@ -218,6 +216,7 @@ private void readDefault() {
 
           if (!readDomainName(_buffer.substring(length))) {
             //if we didn't find an ipv6 address, then check inside the brackets for urls
+            readEnd(ReadEndState.InvalidUrl);
             _reader.seek(beginning);
             _dontMatchIpv6 = true;
           }
@@ -235,7 +234,9 @@ private void readDefault() {
 
             //unread this "/" and continue to check the domain name starting from the beginning of the domain
             _reader.goBack();
-            readDomainName(_buffer.substring(length));
+            if (!readDomainName(_buffer.substring(length))) {
+              readEnd(ReadEndState.InvalidUrl);
+            }
             length = 0;
           } else {
 
@@ -265,7 +266,9 @@ private void readDefault() {
       }
     }
     if (_options.hasFlag(UrlDetectorOptions.ALLOW_SINGLE_LEVEL_DOMAIN) && _buffer.length() > 0 && _hasScheme) {
-      readDomainName(_buffer.substring(length));
+      if (!readDomainName(_buffer.substring(length))) {
+        readEnd(ReadEndState.InvalidUrl);
+      }
     }
   }
 
@@ -277,10 +280,16 @@ private void readDefault() {
   private int processColon(int length) {
     if (_hasScheme) {
       //read it as username/password if it has scheme
-      if (!readUserPass(length) && _buffer.length() > 0) {
+      if (!readUserPass(length)) {
         //unread the ":" so that the domain reader can process it
         _reader.goBack();
-        _buffer.delete(_buffer.length() - 1, _buffer.length());
+
+        // Drop the last buffered character only when the buffer is non-empty; otherwise reset length to 0
+        if (_buffer.length() > 0) {
+          _buffer.delete(_buffer.length() - 1, _buffer.length());
+        } else {
+          length = 0;
+        }
 
         int backtrackOnFail = _reader.getPosition() - _buffer.length() + length;
         if (!readDomainName(_buffer.substring(length))) {
@@ -289,6 +298,8 @@ private int processColon(int length) {
           readEnd(ReadEndState.InvalidUrl);
         }
         length = 0;
+      } else {
+        length = 0;
       }
     } else if (readScheme() && _buffer.length() > 0) {
       _hasScheme = true;
@@ -297,7 +308,9 @@ private int processColon(int length) {
         && _reader.canReadChars(1)) { //takes care of case like hi:
       _reader.goBack(); //unread the ":" so readDomainName can take care of the port
       _buffer.delete(_buffer.length() - 1, _buffer.length());
-      readDomainName(_buffer.toString());
+      if (!readDomainName(_buffer.toString())) {
+        readEnd(ReadEndState.InvalidUrl);
+      }
     } else {
       readEnd(ReadEndState.InvalidUrl);
       length = 0;
@@ -470,10 +483,9 @@ private boolean readScheme() {
    * @return True if a valid username and password was found.
    */
   private boolean readUserPass(int beginningOfUsername) {
-
     //The start of where we are.
     int start = _buffer.length();
 
     //keep looping until "done"
     boolean done = false;
 
@@ -547,8 +559,12 @@ public void addCharacter(char character) {
         return readPort();
       case ReadQueryString:
         return readQueryString();
+      case ReadUserPass:
+        int host = _currentUrlMarker.indexOf(UrlPart.HOST);
+        _currentUrlMarker.unsetIndex(UrlPart.HOST);
+        return readUserPass(host);
       default:
-        return readEnd(ReadEndState.InvalidUrl);
+        return false;
     }
   }
 

diff --git a/url-detector/src/test/java/com/linkedin/urls/TestUrl.java b/url-detector/src/test/java/com/linkedin/urls/TestUrl.java
@@ -28,7 +28,8 @@ private Object[][] getUsernamePasswordUrls() {
         {"@www.google.com", "www.google.com", "/", "", ""},
         {"lalal:@www.gogo.com", "www.gogo.com", "/", "lalal", ""},
         {"nono:boo@[::1]", "[::1]", "/", "nono", "boo"},
-        {"nono:boo@yahoo.com/@1234", "yahoo.com", "/@1234", "nono", "boo"}
+        {"nono:boo@yahoo.com/@1234", "yahoo.com", "/@1234", "nono", "boo"},
+        {"big.big.boss@google.com", "google.com", "/", "big.big.boss", ""}
     };
   }
 

diff --git a/url-detector/src/test/java/com/linkedin/urls/detection/TestInputTextReader.java b/url-detector/src/test/java/com/linkedin/urls/detection/TestInputTextReader.java
@@ -59,13 +59,4 @@ public void testSeek() {
     reader.seek(1);
     Assert.assertEquals(reader.read(), CONTENT.charAt(1));
   }
-
-  @Test(expectedExceptions = NegativeArraySizeException.class, expectedExceptionsMessageRegExp = ".*" + CONTENT + ".*")
-  public void testEndlessLoopDetection() {
-    InputTextReader reader = new InputTextReader(CONTENT);
-    for (int i = 0; i < InputTextReader.MAX_BACKTRACK_MULTIPLIER + 1; i++) {
-      reader.seek(CONTENT.length());
-      reader.seek(0);
-    }
-  }
 }
diff --git a/url-detector/src/test/java/com/linkedin/urls/detection/TestUriDetection.java b/url-detector/src/test/java/com/linkedin/urls/detection/TestUriDetection.java
@@ -648,6 +648,42 @@ public void testIpv6ZoneIndicesWithUrlEncodedDots(String address, String zoneInd
   public void testBacktrackInvalidUsernamePassword() {
     runTest("http://hello:asdf.com", UrlDetectorOptions.Default, "asdf.com");
   }
+
+  /*
+   * https://github.com/linkedin/URL-Detector/issues/12
+   */
+  @Test
+  public void testIssue12() {
+    runTest("http://user:pass@host.com host.com", UrlDetectorOptions.Default, "http://user:pass@host.com", "host.com");
+  }
+
+  /*
+   * https://github.com/linkedin/URL-Detector/issues/13
+   */
+  @Test
+  public void testIssue13() {
+    runTest("user@github.io/page", UrlDetectorOptions.Default, "user@github.io/page");
+    runTest("name@gmail.com", UrlDetectorOptions.Default, "name@gmail.com");
+    runTest("name.lastname@gmail.com", UrlDetectorOptions.Default, "name.lastname@gmail.com");
+    runTest("gmail.com@gmail.com", UrlDetectorOptions.Default, "gmail.com@gmail.com");
+    runTest("first.middle.reallyreallyreallyreallyreallyreallyreallyreallyreallyreallylonglastname@gmail.com", UrlDetectorOptions.Default, "first.middle.reallyreallyreallyreallyreallyreallyreallyreallyreallyreallylonglastname@gmail.com");
+  }
+
+  /*
+   * https://github.com/linkedin/URL-Detector/issues/15
+   */
+  @Test
+  public void testIssue15() {
+    runTest(".............:::::::::::;;;;;;;;;;;;;;;::...............................................:::::::::::::::::::::::::::::....................", UrlDetectorOptions.Default);
+  }
+
+  /*
+   * https://github.com/linkedin/URL-Detector/issues/16
+   */
+  @Test
+  public void testIssue16() {
+    runTest("://VIVE MARINE LE PEN//:@.", UrlDetectorOptions.Default);
+  }
 
   private void runTest(String text, UrlDetectorOptions options, String... expected) {
     //do the detection