Merge pull request #829 from spencermountain/dev

Dev
spencermountain · Mar 29, 2021 · 652196e · 652196e
2 parents 543595e + bd4c84b
commit 652196e
Show file tree

Hide file tree

Showing 81 changed files with 2,213 additions and 1,247 deletions.
diff --git a/.eslintrc b/.eslintrc
@@ -6,12 +6,13 @@
     "node": true
   },
   "parserOptions": {
-    "ecmaVersion": 2017,
+    "ecmaVersion": 2018,
     "sourceType": "module",
     "ecmaFeatures": {}
   },
   "extends": [],
   "rules": {
+    "regexp/prefer-d": 0,
     "semi": ["warn", "never"],
     "indent": ["error", 2, { "SwitchCase": 1 }],
     "spaced-comment": 0,

diff --git a/README.md b/README.md
@@ -26,17 +26,36 @@
   <a href="https://bundlephobia.com/result?p=compromise">
     <img src="https://badge-size.herokuapp.com/spencermountain/compromise/master/builds/compromise.min.js" />
   </a>
-  <a href="https://spectrum.chat/nlp-compromise">
-    <img src="https://img.shields.io/badge/spectrum-chat-%23b14344" />
-  </a>
   </div>
 </div>
 
 <!-- spacer -->
-<img height="15px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/>
+<img height="85px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/>
+
+<div align="left">
+isn't it weird how we can <i>write text</i>, but not parse it?
+<br/>
+<ul>
+   <i>↬<sub>ᔐᖜ</sub>↬-</i> and how we can't get the information <i>back out</i>?⇬
+</ul>
+</div>
+<img height="55px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/>
+
+<div align="center">
+it's like we've agreed that
+<div>
+text is a dead-end.
+</div>
+<sub>and the knowledge in it</sub>
+<br/>
+<sub>should not really be used.</sub>
+</div>
+
+<!-- spacer -->
+<img height="45px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/>
 
 <div align="left">
-  <img height="30px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/>compromise <a href="https://observablehq.com/@spencermountain/compromise-justification">tries its best</a>.
+  <img height="30px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/>compromise <a href="https://observablehq.com/@spencermountain/compromise-justification">tries its best</a> to parse text.
 </div>
 
 <div align="left">
@@ -45,7 +64,13 @@
   <a href="https://docs.compromise.cool/compromise-filesize">small</a>,
   <a href="https://docs.compromise.cool/compromise-performance">quick</a>,
   and often <i><a href="https://docs.compromise.cool/compromise-accuracy">good-enough</a></i>.
-</div>
+  <br/>
+   <img height="30px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/> it is not as smart as you'd think.
+   <br/>
+   <!-- spacer -->
+<!-- <img height="45px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/>
+   it is though - very open-ended, hackable, and open to engineering. -->
+   </div>
 
 <!-- spacer -->
 <img height="50px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/>
@@ -96,7 +121,7 @@ doc.text()
 
 ### .nouns():
 
-transform nouns to plural and possessive forms:
+play between plural, singular and possessive forms:
 
 ```js
 let doc = nlp('the purple dinosaur')
@@ -134,7 +159,7 @@ doc.text()
 
 ### .topics():
 
-grab the big subjects:
+names/places/orgs, tldr:
 
 ```js
 let doc = nlp(buddyHolly)
@@ -163,7 +188,7 @@ doc.topics().json()
 
 ### .contractions():
 
-handle implicit words:
+handle implicit terms:
 
 ```js
 let doc = nlp("we're not gonna take it, no we ain't gonna take it.")
@@ -792,7 +817,7 @@ or if you don't care about POS-tagging, you can use the tokenize-only build: (90
 
 - **inter-sentence match:**
   By default, sentences are the top-level abstraction.
-  Inter-sentence, or multi-sentence matches aren't supported:
+  Inter-sentence, or multi-sentence matches aren't supported without <a href="https://github.com/spencermountain/compromise/tree/master/plugins/paragraphs">a plugin</a>:
   <code>nlp("that's it. Back to Winnipeg!").has('it back')//false</code>
 
 - **nested match syntax:**

diff --git a/builds/compromise-tokenize.js b/builds/compromise-tokenize.js
diff --git a/builds/compromise.js b/builds/compromise.js
@@ -1,4 +1,4 @@
-/* compromise 13.10.4 MIT */
+/* compromise 13.10.5 MIT */
 (function (global, factory) {
   typeof exports === 'object' && typeof module !== 'undefined' ? module.exports = factory() :
   typeof define === 'function' && define.amd ? define(factory) :
@@ -199,7 +199,7 @@
   var periodAcronym = /([A-Z]\.)+[A-Z]?,?$/;
   var oneLetterAcronym$1 = /^[A-Z]\.,?$/;
   var noPeriodAcronym = /[A-Z]{2,}('s|,)?$/;
-  var lowerCaseAcronym = /([a-z]\.){1,}[a-z]\.?$/;
+  var lowerCaseAcronym = /([a-z]\.)+[a-z]\.?$/;
 
   var isAcronym$2 = function isAcronym(str) {
     //like N.D.A
@@ -302,8 +302,8 @@
   //all punctuation marks, from https://en.wikipedia.org/wiki/Punctuation
   //we have slightly different rules for start/end - like #hashtags.
 
-  var startings = /^[ \n\t\.’'\[\](){}⟨⟩:,،、‒–—―…!.‹›«»‐\-?‘’;\/⁄·&*•^†‡°¡¿※№÷×ºª%‰+−=‱¶′″‴§~|‖¦©℗®℠™¤₳฿\u0022|\uFF02|\u0027|\u201C|\u2018|\u201F|\u201B|\u201E|\u2E42|\u201A|\u00AB|\u2039|\u2035|\u2036|\u2037|\u301D|\u0060|\u301F]+/;
-  var endings = /[ \n\t\.’'\[\](){}⟨⟩:,،、‒–—―…!.‹›«»‐\-?‘’;\/⁄·&*@•^†‡°¡¿※#№÷×ºª‰+−=‱¶′″‴§~|‖¦©℗®℠™¤₳฿\u0022|\uFF02|\u0027|\u201D|\u2019|\u201D|\u2019|\u201D|\u201D|\u2019|\u00BB|\u203A|\u2032|\u2033|\u2034|\u301E|\u00B4|\u301E]+$/; //money = ₵¢₡₢$₫₯֏₠€ƒ₣₲₴₭₺₾ℳ₥₦₧₱₰£៛₽₹₨₪৳₸₮₩¥
+  var startings = /^[ \n\t\.\[\](){}⟨⟩:,،、‒–—―…!‹›«»‐\-?‘’;\/⁄·&*•^†‡°¡¿※№÷×ºª%‰+−=‱¶′″‴§~\|‖¦©℗®℠™¤₳฿\u0022\uFF02\u0027\u201C\u201F\u201B\u201E\u2E42\u201A\u2035\u2036\u2037\u301D\u0060\u301F]+/;
+  var endings = /[ \n\t\.'\[\](){}⟨⟩:,،、‒–—―…!‹›«»‐\-?‘’;\/⁄·&*@•^†‡°¡¿※#№÷×ºª‰+−=‱¶′″‴§~\|‖¦©℗®℠™¤₳฿\u0022\uFF02\u201D\u00B4\u301E]+$/; //money = ₵¢₡₢$₫₯֏₠€ƒ₣₲₴₭₺₾ℳ₥₦₧₱₰£៛₽₹₨₪৳₸₮₩¥
 
   var hasSlash = /\//;
   var hasApostrophe = /['’]/;
@@ -2671,7 +2671,7 @@
   */
   var hasMinMax = /\{([0-9]+,?[0-9]*)\}/;
   var andSign = /&&/;
-  var captureName = new RegExp(/^< *?(\S+) *?>/);
+  var captureName = new RegExp(/^<\s*?(\S+)\s*?>/);
 
   var titleCase$2 = function titleCase(str) {
     return str.charAt(0).toUpperCase() + str.substr(1);
@@ -3504,7 +3504,7 @@
   var hasEllipse = /(?:\u2026|\.{2,}) *$/;
   var newLine = /((?:\r?\n|\r)+)/; // Match different new-line formats
 
-  var hasLetter = /[a-z0-9\u00C0-\u00FF\u00a9|\u00ae|[\u2000-\u3300]|\ud83c[\ud000-\udfff]|\ud83d[\ud000-\udfff]|\ud83e[\ud000-\udfff]/i;
+  var hasLetter = /[a-z0-9\u00C0-\u00FF\u00a9\u00ae\u2000-\u3300\ud000-\udfff]/i;
   var startWhitespace = /^\s+/; // Start with a regex:
 
   var naiive_split = function naiive_split(text) {
@@ -3866,7 +3866,7 @@
 
   var fromJSON_1 = fromJSON;
 
-  var _version = '13.10.4';
+  var _version = '13.10.5';
 
   var entity = ['Person', 'Place', 'Organization'];
   var nouns$1 = {
@@ -5780,7 +5780,7 @@
       }
     }],
     y: [{
-      reg: /([i|f|rr])y$/i,
+      reg: /(i|f|rr)y$/i,
       repl: {
         pr: '$1ies',
         pa: '$1ied',
@@ -9478,8 +9478,9 @@
   //fraction
   [/^[0-9]{1,4}\/[0-9]{1,4}(st|nd|rd|th)?s?$/, ['Fraction', 'NumericValue']], //3/2ths
   //range
-  [/^[0-9.]{1,2}[-–][0-9]{1,2}$/, ['Value', 'NumberRange']], //7-8
-  [/^[0-9.]{1,3}(st|nd|rd|th)?[-–][0-9\.]{1,3}(st|nd|rd|th)?$/, 'NumberRange'], //5-7
+  [/^[0-9.]{1,3}[a-z]{0,2}[-–—][0-9]{1,3}[a-z]{0,2}$/, ['Value', 'NumberRange']], //7th-8th
+  //time-range
+  [/^[0-9][0-9]?(:[0-9][0-9])?(am|pm)?[-–—][0-9][0-9]?(:[0-9][0-9])?(am|pm)?$/, ['Time', 'NumberRange']], //7pm-8:30
   //with unit
   [/^[0-9.]+([a-z]{1,4})$/, 'Value'] //like 5tbsp
   //ordinal
@@ -10657,7 +10658,8 @@
 
   var _05PerfectTense = checkPerfect;
 
-  var isRange = /^([0-9]{1,3}(?:st|nd|rd|th)?)[-–—]([0-9]{1,3}(?:st|nd|rd|th)?)$/i; //split '2-4' into '2 to 4'
+  var isRange = /^([0-9.]{1,3}[a-z]{0,2})[-–—]([0-9]{1,3}[a-z]{0,2})$/i;
+  var timeRange = /^([0-9][0-9]?(:[0-9][0-9])?(am|pm)?)[-–—]([0-9][0-9]?(:[0-9][0-9])?(am|pm)?)$/i; //split '2-4' into '2 to 4'
 
   var checkRange = function checkRange(term) {
     if (term.tags.PhoneNumber === true) {
@@ -10668,6 +10670,12 @@
 
     if (parts !== null) {
       return [parts[1], 'to', parts[2]];
+    } else {
+      parts = term.text.match(timeRange);
+
+      if (parts !== null) {
+        return [parts[1], 'to', parts[4]];
+      }
     }
 
     return null;
@@ -10719,6 +10727,7 @@
 
   var isNumber = /^[0-9]+$/;
   var isOrdinal = /^[0-9]+(st|nd|rd|th)$/;
+  var isTime = /^[0-9:]+(am|pm)$/;
 
   var createPhrase = function createPhrase(found, doc) {
     //create phrase from ['would', 'not']
@@ -10739,6 +10748,8 @@
         t.tag('Cardinal', 'num-range', doc.world);
       } else if (isOrdinal.test(t.implicit)) {
         t.tag('Ordinal', 'ord-range', doc.world);
+      } else if (isTime.test(t.implicit)) {
+        t.tag('Time', 'time-range', doc.world);
       } else if (Object.keys(t.tags).length === 0) {
         t.tags.Noun = true; // if no tag, give it a noun
       }

diff --git a/builds/compromise.min.js b/builds/compromise.min.js
diff --git a/builds/compromise.mjs b/builds/compromise.mjs
@@ -1,4 +1,4 @@
-/* compromise 13.10.4 MIT */
+/* compromise 13.10.5 MIT */
 function _typeof(obj) {
   "@babel/helpers - typeof";
 
@@ -193,7 +193,7 @@ var unicode_1 = killUnicode; // console.log(killUnicode('bjŏȒk—Ɏó'));
 var periodAcronym = /([A-Z]\.)+[A-Z]?,?$/;
 var oneLetterAcronym$1 = /^[A-Z]\.,?$/;
 var noPeriodAcronym = /[A-Z]{2,}('s|,)?$/;
-var lowerCaseAcronym = /([a-z]\.){1,}[a-z]\.?$/;
+var lowerCaseAcronym = /([a-z]\.)+[a-z]\.?$/;
 
 var isAcronym$2 = function isAcronym(str) {
   //like N.D.A
@@ -296,8 +296,8 @@ var reduce = reduced;
 //all punctuation marks, from https://en.wikipedia.org/wiki/Punctuation
 //we have slightly different rules for start/end - like #hashtags.
 
-var startings = /^[ \n\t\.’'\[\](){}⟨⟩:,،、‒–—―…!.‹›«»‐\-?‘’;\/⁄·&*•^†‡°¡¿※№÷×ºª%‰+−=‱¶′″‴§~|‖¦©℗®℠™¤₳฿\u0022|\uFF02|\u0027|\u201C|\u2018|\u201F|\u201B|\u201E|\u2E42|\u201A|\u00AB|\u2039|\u2035|\u2036|\u2037|\u301D|\u0060|\u301F]+/;
-var endings = /[ \n\t\.’'\[\](){}⟨⟩:,،、‒–—―…!.‹›«»‐\-?‘’;\/⁄·&*@•^†‡°¡¿※#№÷×ºª‰+−=‱¶′″‴§~|‖¦©℗®℠™¤₳฿\u0022|\uFF02|\u0027|\u201D|\u2019|\u201D|\u2019|\u201D|\u201D|\u2019|\u00BB|\u203A|\u2032|\u2033|\u2034|\u301E|\u00B4|\u301E]+$/; //money = ₵¢₡₢$₫₯֏₠€ƒ₣₲₴₭₺₾ℳ₥₦₧₱₰£៛₽₹₨₪৳₸₮₩¥
+var startings = /^[ \n\t\.\[\](){}⟨⟩:,،、‒–—―…!‹›«»‐\-?‘’;\/⁄·&*•^†‡°¡¿※№÷×ºª%‰+−=‱¶′″‴§~\|‖¦©℗®℠™¤₳฿\u0022\uFF02\u0027\u201C\u201F\u201B\u201E\u2E42\u201A\u2035\u2036\u2037\u301D\u0060\u301F]+/;
+var endings = /[ \n\t\.'\[\](){}⟨⟩:,،、‒–—―…!‹›«»‐\-?‘’;\/⁄·&*@•^†‡°¡¿※#№÷×ºª‰+−=‱¶′″‴§~\|‖¦©℗®℠™¤₳฿\u0022\uFF02\u201D\u00B4\u301E]+$/; //money = ₵¢₡₢$₫₯֏₠€ƒ₣₲₴₭₺₾ℳ₥₦₧₱₰£៛₽₹₨₪৳₸₮₩¥
 
 var hasSlash = /\//;
 var hasApostrophe = /['’]/;
@@ -2665,7 +2665,7 @@ var _01ParseBlocks = parseBlocks; // console.log('(one two) (upto) [<snooze_to>#
 */
 var hasMinMax = /\{([0-9]+,?[0-9]*)\}/;
 var andSign = /&&/;
-var captureName = new RegExp(/^< *?(\S+) *?>/);
+var captureName = new RegExp(/^<\s*?(\S+)\s*?>/);
 
 var titleCase$2 = function titleCase(str) {
   return str.charAt(0).toUpperCase() + str.substr(1);
@@ -3498,7 +3498,7 @@ var isAcronym$1 = /[ .][A-Z]\.? *$/i;
 var hasEllipse = /(?:\u2026|\.{2,}) *$/;
 var newLine = /((?:\r?\n|\r)+)/; // Match different new-line formats
 
-var hasLetter = /[a-z0-9\u00C0-\u00FF\u00a9|\u00ae|[\u2000-\u3300]|\ud83c[\ud000-\udfff]|\ud83d[\ud000-\udfff]|\ud83e[\ud000-\udfff]/i;
+var hasLetter = /[a-z0-9\u00C0-\u00FF\u00a9\u00ae\u2000-\u3300\ud000-\udfff]/i;
 var startWhitespace = /^\s+/; // Start with a regex:
 
 var naiive_split = function naiive_split(text) {
@@ -3860,7 +3860,7 @@ var fromJSON = function fromJSON(json, world) {
 
 var fromJSON_1 = fromJSON;
 
-var _version = '13.10.4';
+var _version = '13.10.5';
 
 var entity = ['Person', 'Place', 'Organization'];
 var nouns$1 = {
@@ -5774,7 +5774,7 @@ var endsWith$1 = {
     }
   }],
   y: [{
-    reg: /([i|f|rr])y$/i,
+    reg: /(i|f|rr)y$/i,
     repl: {
       pr: '$1ies',
       pa: '$1ied',
@@ -9472,8 +9472,9 @@ var startsWith = [//web tags
 //fraction
 [/^[0-9]{1,4}\/[0-9]{1,4}(st|nd|rd|th)?s?$/, ['Fraction', 'NumericValue']], //3/2ths
 //range
-[/^[0-9.]{1,2}[-–][0-9]{1,2}$/, ['Value', 'NumberRange']], //7-8
-[/^[0-9.]{1,3}(st|nd|rd|th)?[-–][0-9\.]{1,3}(st|nd|rd|th)?$/, 'NumberRange'], //5-7
+[/^[0-9.]{1,3}[a-z]{0,2}[-–—][0-9]{1,3}[a-z]{0,2}$/, ['Value', 'NumberRange']], //7th-8th
+//time-range
+[/^[0-9][0-9]?(:[0-9][0-9])?(am|pm)?[-–—][0-9][0-9]?(:[0-9][0-9])?(am|pm)?$/, ['Time', 'NumberRange']], //7pm-8:30
 //with unit
 [/^[0-9.]+([a-z]{1,4})$/, 'Value'] //like 5tbsp
 //ordinal
@@ -10651,7 +10652,8 @@ var checkPerfect = function checkPerfect(term, phrase) {
 
 var _05PerfectTense = checkPerfect;
 
-var isRange = /^([0-9]{1,3}(?:st|nd|rd|th)?)[-–—]([0-9]{1,3}(?:st|nd|rd|th)?)$/i; //split '2-4' into '2 to 4'
+var isRange = /^([0-9.]{1,3}[a-z]{0,2})[-–—]([0-9]{1,3}[a-z]{0,2})$/i;
+var timeRange = /^([0-9][0-9]?(:[0-9][0-9])?(am|pm)?)[-–—]([0-9][0-9]?(:[0-9][0-9])?(am|pm)?)$/i; //split '2-4' into '2 to 4'
 
 var checkRange = function checkRange(term) {
   if (term.tags.PhoneNumber === true) {
@@ -10662,6 +10664,12 @@ var checkRange = function checkRange(term) {
 
   if (parts !== null) {
     return [parts[1], 'to', parts[2]];
+  } else {
+    parts = term.text.match(timeRange);
+
+    if (parts !== null) {
+      return [parts[1], 'to', parts[4]];
+    }
   }
 
   return null;
@@ -10713,6 +10721,7 @@ var _07French = checkFrench;
 
 var isNumber = /^[0-9]+$/;
 var isOrdinal = /^[0-9]+(st|nd|rd|th)$/;
+var isTime = /^[0-9:]+(am|pm)$/;
 
 var createPhrase = function createPhrase(found, doc) {
   //create phrase from ['would', 'not']
@@ -10733,6 +10742,8 @@ var createPhrase = function createPhrase(found, doc) {
       t.tag('Cardinal', 'num-range', doc.world);
     } else if (isOrdinal.test(t.implicit)) {
       t.tag('Ordinal', 'ord-range', doc.world);
+    } else if (isTime.test(t.implicit)) {
+      t.tag('Time', 'time-range', doc.world);
     } else if (Object.keys(t.tags).length === 0) {
       t.tags.Noun = true; // if no tag, give it a noun
     }

diff --git a/changelog.md b/changelog.md
@@ -10,6 +10,11 @@ While all _Major_ releases should be reviewed, our only two _large_ releases are
 
 <!-- #### [Unreleased] 
 -->
+#### 13.10.5  [March 2021]
+- **[new]** - support Time-range like '3pm-4pm'
+- **[change]** - clean up some unicode regexes
+*plugin-releases*:  dates
+
 #### 13.10.4  [March 2021]
 - **[fix]** - match syntax tokenization fix
 - **[change]** - improved performance monitoring