Skip to content

Commit

Permalink
Merge pull request #829 from spencermountain/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
spencermountain authored Mar 29, 2021
2 parents 543595e + bd4c84b commit 652196e
Show file tree
Hide file tree
Showing 81 changed files with 2,213 additions and 1,247 deletions.
3 changes: 2 additions & 1 deletion .eslintrc
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,13 @@
"node": true
},
"parserOptions": {
"ecmaVersion": 2017,
"ecmaVersion": 2018,
"sourceType": "module",
"ecmaFeatures": {}
},
"extends": [],
"rules": {
"regexp/prefer-d": 0,
"semi": ["warn", "never"],
"indent": ["error", 2, { "SwitchCase": 1 }],
"spaced-comment": 0,
Expand Down
45 changes: 35 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,17 +26,36 @@
<a href="https://bundlephobia.com/result?p=compromise">
<img src="https://badge-size.herokuapp.com/spencermountain/compromise/master/builds/compromise.min.js" />
</a>
<a href="https://spectrum.chat/nlp-compromise">
<img src="https://img.shields.io/badge/spectrum-chat-%23b14344" />
</a>
</div>
</div>

<!-- spacer -->
<img height="15px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/>
<img height="85px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/>

<div align="left">
isn't it weird how we can <i>write text</i>, but not parse it?
<br/>
<ul>
<i>↬<sub>ᔐᖜ</sub>↬-</i> and how we can't get the information <i>back out</i>?⇬
</ul>
</div>
<img height="55px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/>

<div align="center">
it's like we've agreed that
<div>
text is a dead-end.
</div>
<sub>and the knowledge in it</sub>
<br/>
<sub>should not really be used.</sub>
</div>

<!-- spacer -->
<img height="45px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/>

<div align="left">
<img height="30px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/>compromise <a href="https://observablehq.com/@spencermountain/compromise-justification">tries its best</a>.
<img height="30px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/>compromise <a href="https://observablehq.com/@spencermountain/compromise-justification">tries its best</a> to parse text.
</div>

<div align="left">
Expand All @@ -45,7 +64,13 @@
<a href="https://docs.compromise.cool/compromise-filesize">small</a>,
<a href="https://docs.compromise.cool/compromise-performance">quick</a>,
and often <i><a href="https://docs.compromise.cool/compromise-accuracy">good-enough</a></i>.
</div>
<br/>
<img height="30px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/> it is not as smart as you'd think.
<br/>
<!-- spacer -->
<!-- <img height="45px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/>
it is though - very open-ended, hackable, and open to engineering. -->
</div>

<!-- spacer -->
<img height="50px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/>
Expand Down Expand Up @@ -96,7 +121,7 @@ doc.text()

### .nouns():

transform nouns to plural and possessive forms:
play between plural, singular and possessive forms:

```js
let doc = nlp('the purple dinosaur')
Expand Down Expand Up @@ -134,7 +159,7 @@ doc.text()

### .topics():

grab the big subjects:
names/places/orgs, tldr:

```js
let doc = nlp(buddyHolly)
Expand Down Expand Up @@ -163,7 +188,7 @@ doc.topics().json()

### .contractions():

handle implicit words:
handle implicit terms:

```js
let doc = nlp("we're not gonna take it, no we ain't gonna take it.")
Expand Down Expand Up @@ -792,7 +817,7 @@ or if you don't care about POS-tagging, you can use the tokenize-only build: (90

- **inter-sentence match:**
By default, sentences are the top-level abstraction.
Inter-sentence, or multi-sentence matches aren't supported:
Inter-sentence, or multi-sentence matches aren't supported without <a href="https://github.com/spencermountain/compromise/tree/master/plugins/paragraphs">a plugin</a>:
<code>nlp("that's it. Back to Winnipeg!").has('it back')//false</code>

- **nested match syntax:**
Expand Down
2 changes: 1 addition & 1 deletion builds/compromise-tokenize.js

Large diffs are not rendered by default.

33 changes: 22 additions & 11 deletions builds/compromise.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/* compromise 13.10.4 MIT */
/* compromise 13.10.5 MIT */
(function (global, factory) {
typeof exports === 'object' && typeof module !== 'undefined' ? module.exports = factory() :
typeof define === 'function' && define.amd ? define(factory) :
Expand Down Expand Up @@ -199,7 +199,7 @@
var periodAcronym = /([A-Z]\.)+[A-Z]?,?$/;
var oneLetterAcronym$1 = /^[A-Z]\.,?$/;
var noPeriodAcronym = /[A-Z]{2,}('s|,)?$/;
var lowerCaseAcronym = /([a-z]\.){1,}[a-z]\.?$/;
var lowerCaseAcronym = /([a-z]\.)+[a-z]\.?$/;

var isAcronym$2 = function isAcronym(str) {
//like N.D.A
Expand Down Expand Up @@ -302,8 +302,8 @@
//all punctuation marks, from https://en.wikipedia.org/wiki/Punctuation
//we have slightly different rules for start/end - like #hashtags.

var startings = /^[ \n\t\.’'\[\](){}⟨⟩:,،、‒–—―…!.‹›«»‐\-?‘’;\/⁄·&*•^†‡°¡¿※№÷׺ª%‰+−=‱¶′″‴§~|‖¦©℗®℠™¤₳฿\u0022|\uFF02|\u0027|\u201C|\u2018|\u201F|\u201B|\u201E|\u2E42|\u201A|\u00AB|\u2039|\u2035|\u2036|\u2037|\u301D|\u0060|\u301F]+/;
var endings = /[ \n\t\.'\[\](){}⟨⟩:,،、‒–—―…!.‹›«»‐\-?‘’;\/⁄·&*@•^†‡°¡¿※#№÷׺ª‰+−=‱¶′″‴§~|‖¦©℗®℠™¤₳฿\u0022|\uFF02|\u0027|\u201D|\u2019|\u201D|\u2019|\u201D|\u201D|\u2019|\u00BB|\u203A|\u2032|\u2033|\u2034|\u301E|\u00B4|\u301E]+$/; //money = ₵¢₡₢$₫₯֏₠€ƒ₣₲₴₭₺₾ℳ₥₦₧₱₰£៛₽₹₨₪৳₸₮₩¥
var startings = /^[ \n\t\.\[\](){}⟨⟩:,،、‒–—―…!‹›«»‐\-?‘’;\/⁄·&*•^†‡°¡¿※№÷׺ª%‰+−=‱¶′″‴§~\|‖¦©℗®℠™¤₳฿\u0022\uFF02\u0027\u201C\u201F\u201B\u201E\u2E42\u201A\u2035\u2036\u2037\u301D\u0060\u301F]+/;
var endings = /[ \n\t\.'\[\](){}⟨⟩:,،、‒–—―…!‹›«»‐\-?‘’;\/⁄·&*@•^†‡°¡¿※#№÷׺ª‰+−=‱¶′″‴§~\|‖¦©℗®℠™¤₳฿\u0022\uFF02\u201D\u00B4\u301E]+$/; //money = ₵¢₡₢$₫₯֏₠€ƒ₣₲₴₭₺₾ℳ₥₦₧₱₰£៛₽₹₨₪৳₸₮₩¥

var hasSlash = /\//;
var hasApostrophe = /['’]/;
Expand Down Expand Up @@ -2671,7 +2671,7 @@
*/
var hasMinMax = /\{([0-9]+,?[0-9]*)\}/;
var andSign = /&&/;
var captureName = new RegExp(/^< *?(\S+) *?>/);
var captureName = new RegExp(/^<\s*?(\S+)\s*?>/);

var titleCase$2 = function titleCase(str) {
return str.charAt(0).toUpperCase() + str.substr(1);
Expand Down Expand Up @@ -3504,7 +3504,7 @@
var hasEllipse = /(?:\u2026|\.{2,}) *$/;
var newLine = /((?:\r?\n|\r)+)/; // Match different new-line formats

var hasLetter = /[a-z0-9\u00C0-\u00FF\u00a9|\u00ae|[\u2000-\u3300]|\ud83c[\ud000-\udfff]|\ud83d[\ud000-\udfff]|\ud83e[\ud000-\udfff]/i;
var hasLetter = /[a-z0-9\u00C0-\u00FF\u00a9\u00ae\u2000-\u3300\ud000-\udfff]/i;
var startWhitespace = /^\s+/; // Start with a regex:

var naiive_split = function naiive_split(text) {
Expand Down Expand Up @@ -3866,7 +3866,7 @@

var fromJSON_1 = fromJSON;

var _version = '13.10.4';
var _version = '13.10.5';

var entity = ['Person', 'Place', 'Organization'];
var nouns$1 = {
Expand Down Expand Up @@ -5780,7 +5780,7 @@
}
}],
y: [{
reg: /([i|f|rr])y$/i,
reg: /(i|f|rr)y$/i,
repl: {
pr: '$1ies',
pa: '$1ied',
Expand Down Expand Up @@ -9478,8 +9478,9 @@
//fraction
[/^[0-9]{1,4}\/[0-9]{1,4}(st|nd|rd|th)?s?$/, ['Fraction', 'NumericValue']], //3/2ths
//range
[/^[0-9.]{1,2}[-–][0-9]{1,2}$/, ['Value', 'NumberRange']], //7-8
[/^[0-9.]{1,3}(st|nd|rd|th)?[-–][0-9\.]{1,3}(st|nd|rd|th)?$/, 'NumberRange'], //5-7
[/^[0-9.]{1,3}[a-z]{0,2}[-–—][0-9]{1,3}[a-z]{0,2}$/, ['Value', 'NumberRange']], //7th-8th
//time-range
[/^[0-9][0-9]?(:[0-9][0-9])?(am|pm)?[-–—][0-9][0-9]?(:[0-9][0-9])?(am|pm)?$/, ['Time', 'NumberRange']], //7pm-8:30
//with unit
[/^[0-9.]+([a-z]{1,4})$/, 'Value'] //like 5tbsp
//ordinal
Expand Down Expand Up @@ -10657,7 +10658,8 @@

var _05PerfectTense = checkPerfect;

var isRange = /^([0-9]{1,3}(?:st|nd|rd|th)?)[-–—]([0-9]{1,3}(?:st|nd|rd|th)?)$/i; //split '2-4' into '2 to 4'
var isRange = /^([0-9.]{1,3}[a-z]{0,2})[-–—]([0-9]{1,3}[a-z]{0,2})$/i;
var timeRange = /^([0-9][0-9]?(:[0-9][0-9])?(am|pm)?)[-–—]([0-9][0-9]?(:[0-9][0-9])?(am|pm)?)$/i; //split '2-4' into '2 to 4'

var checkRange = function checkRange(term) {
if (term.tags.PhoneNumber === true) {
Expand All @@ -10668,6 +10670,12 @@

if (parts !== null) {
return [parts[1], 'to', parts[2]];
} else {
parts = term.text.match(timeRange);

if (parts !== null) {
return [parts[1], 'to', parts[4]];
}
}

return null;
Expand Down Expand Up @@ -10719,6 +10727,7 @@

var isNumber = /^[0-9]+$/;
var isOrdinal = /^[0-9]+(st|nd|rd|th)$/;
var isTime = /^[0-9:]+(am|pm)$/;

var createPhrase = function createPhrase(found, doc) {
//create phrase from ['would', 'not']
Expand All @@ -10739,6 +10748,8 @@
t.tag('Cardinal', 'num-range', doc.world);
} else if (isOrdinal.test(t.implicit)) {
t.tag('Ordinal', 'ord-range', doc.world);
} else if (isTime.test(t.implicit)) {
t.tag('Time', 'time-range', doc.world);
} else if (Object.keys(t.tags).length === 0) {
t.tags.Noun = true; // if no tag, give it a noun
}
Expand Down
2 changes: 1 addition & 1 deletion builds/compromise.min.js

Large diffs are not rendered by default.

33 changes: 22 additions & 11 deletions builds/compromise.mjs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/* compromise 13.10.4 MIT */
/* compromise 13.10.5 MIT */
function _typeof(obj) {
"@babel/helpers - typeof";

Expand Down Expand Up @@ -193,7 +193,7 @@ var unicode_1 = killUnicode; // console.log(killUnicode('bjŏȒk—Ɏó'));
var periodAcronym = /([A-Z]\.)+[A-Z]?,?$/;
var oneLetterAcronym$1 = /^[A-Z]\.,?$/;
var noPeriodAcronym = /[A-Z]{2,}('s|,)?$/;
var lowerCaseAcronym = /([a-z]\.){1,}[a-z]\.?$/;
var lowerCaseAcronym = /([a-z]\.)+[a-z]\.?$/;

var isAcronym$2 = function isAcronym(str) {
//like N.D.A
Expand Down Expand Up @@ -296,8 +296,8 @@ var reduce = reduced;
//all punctuation marks, from https://en.wikipedia.org/wiki/Punctuation
//we have slightly different rules for start/end - like #hashtags.

var startings = /^[ \n\t\.’'\[\](){}⟨⟩:,،、‒–—―…!.‹›«»‐\-?‘’;\/⁄·&*•^†‡°¡¿※№÷׺ª%‰+−=‱¶′″‴§~|‖¦©℗®℠™¤₳฿\u0022|\uFF02|\u0027|\u201C|\u2018|\u201F|\u201B|\u201E|\u2E42|\u201A|\u00AB|\u2039|\u2035|\u2036|\u2037|\u301D|\u0060|\u301F]+/;
var endings = /[ \n\t\.'\[\](){}⟨⟩:,،、‒–—―…!.‹›«»‐\-?‘’;\/⁄·&*@•^†‡°¡¿※#№÷׺ª‰+−=‱¶′″‴§~|‖¦©℗®℠™¤₳฿\u0022|\uFF02|\u0027|\u201D|\u2019|\u201D|\u2019|\u201D|\u201D|\u2019|\u00BB|\u203A|\u2032|\u2033|\u2034|\u301E|\u00B4|\u301E]+$/; //money = ₵¢₡₢$₫₯֏₠€ƒ₣₲₴₭₺₾ℳ₥₦₧₱₰£៛₽₹₨₪৳₸₮₩¥
var startings = /^[ \n\t\.\[\](){}⟨⟩:,،、‒–—―…!‹›«»‐\-?‘’;\/⁄·&*•^†‡°¡¿※№÷׺ª%‰+−=‱¶′″‴§~\|‖¦©℗®℠™¤₳฿\u0022\uFF02\u0027\u201C\u201F\u201B\u201E\u2E42\u201A\u2035\u2036\u2037\u301D\u0060\u301F]+/;
var endings = /[ \n\t\.'\[\](){}⟨⟩:,،、‒–—―…!‹›«»‐\-?‘’;\/⁄·&*@•^†‡°¡¿※#№÷׺ª‰+−=‱¶′″‴§~\|‖¦©℗®℠™¤₳฿\u0022\uFF02\u201D\u00B4\u301E]+$/; //money = ₵¢₡₢$₫₯֏₠€ƒ₣₲₴₭₺₾ℳ₥₦₧₱₰£៛₽₹₨₪৳₸₮₩¥

var hasSlash = /\//;
var hasApostrophe = /['’]/;
Expand Down Expand Up @@ -2665,7 +2665,7 @@ var _01ParseBlocks = parseBlocks; // console.log('(one two) (upto) [<snooze_to>#
*/
var hasMinMax = /\{([0-9]+,?[0-9]*)\}/;
var andSign = /&&/;
var captureName = new RegExp(/^< *?(\S+) *?>/);
var captureName = new RegExp(/^<\s*?(\S+)\s*?>/);

var titleCase$2 = function titleCase(str) {
return str.charAt(0).toUpperCase() + str.substr(1);
Expand Down Expand Up @@ -3498,7 +3498,7 @@ var isAcronym$1 = /[ .][A-Z]\.? *$/i;
var hasEllipse = /(?:\u2026|\.{2,}) *$/;
var newLine = /((?:\r?\n|\r)+)/; // Match different new-line formats

var hasLetter = /[a-z0-9\u00C0-\u00FF\u00a9|\u00ae|[\u2000-\u3300]|\ud83c[\ud000-\udfff]|\ud83d[\ud000-\udfff]|\ud83e[\ud000-\udfff]/i;
var hasLetter = /[a-z0-9\u00C0-\u00FF\u00a9\u00ae\u2000-\u3300\ud000-\udfff]/i;
var startWhitespace = /^\s+/; // Start with a regex:

var naiive_split = function naiive_split(text) {
Expand Down Expand Up @@ -3860,7 +3860,7 @@ var fromJSON = function fromJSON(json, world) {

var fromJSON_1 = fromJSON;

var _version = '13.10.4';
var _version = '13.10.5';

var entity = ['Person', 'Place', 'Organization'];
var nouns$1 = {
Expand Down Expand Up @@ -5774,7 +5774,7 @@ var endsWith$1 = {
}
}],
y: [{
reg: /([i|f|rr])y$/i,
reg: /(i|f|rr)y$/i,
repl: {
pr: '$1ies',
pa: '$1ied',
Expand Down Expand Up @@ -9472,8 +9472,9 @@ var startsWith = [//web tags
//fraction
[/^[0-9]{1,4}\/[0-9]{1,4}(st|nd|rd|th)?s?$/, ['Fraction', 'NumericValue']], //3/2ths
//range
[/^[0-9.]{1,2}[-–][0-9]{1,2}$/, ['Value', 'NumberRange']], //7-8
[/^[0-9.]{1,3}(st|nd|rd|th)?[-–][0-9\.]{1,3}(st|nd|rd|th)?$/, 'NumberRange'], //5-7
[/^[0-9.]{1,3}[a-z]{0,2}[-–—][0-9]{1,3}[a-z]{0,2}$/, ['Value', 'NumberRange']], //7th-8th
//time-range
[/^[0-9][0-9]?(:[0-9][0-9])?(am|pm)?[-–—][0-9][0-9]?(:[0-9][0-9])?(am|pm)?$/, ['Time', 'NumberRange']], //7pm-8:30
//with unit
[/^[0-9.]+([a-z]{1,4})$/, 'Value'] //like 5tbsp
//ordinal
Expand Down Expand Up @@ -10651,7 +10652,8 @@ var checkPerfect = function checkPerfect(term, phrase) {

var _05PerfectTense = checkPerfect;

var isRange = /^([0-9]{1,3}(?:st|nd|rd|th)?)[-–—]([0-9]{1,3}(?:st|nd|rd|th)?)$/i; //split '2-4' into '2 to 4'
var isRange = /^([0-9.]{1,3}[a-z]{0,2})[-–—]([0-9]{1,3}[a-z]{0,2})$/i;
var timeRange = /^([0-9][0-9]?(:[0-9][0-9])?(am|pm)?)[-–—]([0-9][0-9]?(:[0-9][0-9])?(am|pm)?)$/i; //split '2-4' into '2 to 4'

var checkRange = function checkRange(term) {
if (term.tags.PhoneNumber === true) {
Expand All @@ -10662,6 +10664,12 @@ var checkRange = function checkRange(term) {

if (parts !== null) {
return [parts[1], 'to', parts[2]];
} else {
parts = term.text.match(timeRange);

if (parts !== null) {
return [parts[1], 'to', parts[4]];
}
}

return null;
Expand Down Expand Up @@ -10713,6 +10721,7 @@ var _07French = checkFrench;

var isNumber = /^[0-9]+$/;
var isOrdinal = /^[0-9]+(st|nd|rd|th)$/;
var isTime = /^[0-9:]+(am|pm)$/;

var createPhrase = function createPhrase(found, doc) {
//create phrase from ['would', 'not']
Expand All @@ -10733,6 +10742,8 @@ var createPhrase = function createPhrase(found, doc) {
t.tag('Cardinal', 'num-range', doc.world);
} else if (isOrdinal.test(t.implicit)) {
t.tag('Ordinal', 'ord-range', doc.world);
} else if (isTime.test(t.implicit)) {
t.tag('Time', 'time-range', doc.world);
} else if (Object.keys(t.tags).length === 0) {
t.tags.Noun = true; // if no tag, give it a noun
}
Expand Down
5 changes: 5 additions & 0 deletions changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,11 @@ While all _Major_ releases should be reviewed, our only two _large_ releases are

<!-- #### [Unreleased]
-->
#### 13.10.5 [March 2021]
- **[new]** - support Time-range like '3pm-4pm'
- **[change]** - cleanup some unicode regexes
*plugin-releases*: dates

#### 13.10.4 [March 2021]
- **[fix]** - match syntax tokenization fix
- **[change]** - improved performance monitoring
Expand Down
Loading

0 comments on commit 652196e

Please sign in to comment.