Skip to content

Commit

Permalink
Merge pull request #826 from spencermountain/dev
Browse files Browse the repository at this point in the history
13.10.3
  • Loading branch information
spencermountain authored Mar 18, 2021
2 parents 1f3fab9 + cc93664 commit 1e0a9ba
Show file tree
Hide file tree
Showing 28 changed files with 943 additions and 341 deletions.
2 changes: 1 addition & 1 deletion builds/compromise-tokenize.js

Large diffs are not rendered by default.

133 changes: 65 additions & 68 deletions builds/compromise.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/* compromise 13.10.2 MIT */
/* compromise 13.10.3 MIT */
(function (global, factory) {
typeof exports === 'object' && typeof module !== 'undefined' ? module.exports = factory() :
typeof define === 'function' && define.amd ? define(factory) :
Expand Down Expand Up @@ -2591,6 +2591,62 @@

var _04PostProcess = postProcess$1;

// Tokenizer patterns used to cut a match-syntax string into /regex/ tokens,
// (block) tokens and plain words.
// supported suffix-flags:
// suffixes: ? ] + * $ {2,6} ~
// [\?\]\+\*\$~]*
// prefixes: ! [ ^
// [\!\[\^]*
//
// A token must have whitespace (or the string boundary) on both sides.
// The trailing boundary is a lookahead, so the separating space is not consumed
// and can still act as the leading boundary of the next token. Consuming it made
// the second of two adjacent tokens ('/a b/ /c d/ foo') unmatchable, and it was
// then broken apart on its inner spaces.

// match 'foo /yes/' and not 'foo/no/bar'
var bySlashes = /(?:^|\s)([\!\[\^]*(?:<[^<]*>)?\/.*?[^\\\/]\/[\?\]\+\*\$~]*)(?=\s|$)/g;

// match '(yes) but not foo(no)bar'
var byParentheses = /(?:^|\s)([\!\[\^]*(?:<[^<]*>)?\(.*?[^\\\)]\)[\?\]\+\*\$~]*)(?=\s|$)/g;

// anything that is not a /regex/ or a (block) is split on single spaces
var byWord = / /g;

// Is this token a whole (parenthesised block)?
// It must open with '(' - optionally preceded by ! [ ^ prefixes and a <name> group -
// and close with ')' followed only by suffix flags (? ] + * $ ~).
var isBlock = function isBlock(str) {
  var opensWithParen = /^[\!\[\^]*(<[^<]*>)?\(/.test(str);

  if (!opensWithParen) {
    return false;
  }

  return /\)[\?\]\+\*\$~]*$/.test(str);
};

// Is this token a whole /regular expression/?
// It must open with '/' - optionally preceded by ! [ ^ prefixes and a <name> group -
// and close with '/' followed only by suffix flags (? ] + * $ ~).
var isReg = function isReg(str) {
  var opensWithSlash = /^[\!\[\^]*(<[^<]*>)?\//.test(str);

  if (!opensWithSlash) {
    return false;
  }

  return /\/[\?\]\+\*\$~]*$/.test(str);
};

// Trim every string in the list and leave out the ones that end up empty.
// Returns a new array; the input array is not modified.
var cleanUp = function cleanUp(arr) {
  var kept = [];
  arr.forEach(function (str) {
    var trimmed = str.trim();

    if (trimmed) {
      kept.push(trimmed);
    }
  });
  return kept;
};

// Cut a match-syntax string into its top-level tokens, in three passes:
//   1. pull out /regex/ tokens
//   2. pull out (block) tokens from each resulting piece
//   3. split whatever is neither a block nor a regex on spaces
// Returns an array of trimmed, non-empty token strings.
var parseBlocks = function parseBlocks(txt) {
  // passes 1 + 2
  var pieces = txt.split(bySlashes).reduce(function (found, part) {
    return found.concat(part.split(byParentheses));
  }, []);

  // pass 3 - blocks and regexes stay whole, the rest is split by spaces
  var tokens = cleanUp(pieces).reduce(function (found, piece) {
    if (isBlock(piece) || isReg(piece)) {
      found.push(piece);
      return found;
    }

    return found.concat(piece.split(byWord));
  }, []);

  return cleanUp(tokens);
};

var _01ParseBlocks = parseBlocks; // console.log(parseBlocks(`[<num>#Value] [<currency>(mark|rand|won|rub|ore)] foo`))

/* break-down a match expression into this:
{
word:'',
Expand All @@ -2610,7 +2666,7 @@
*/
var hasMinMax = /\{([0-9]+,?[0-9]*)\}/;
var andSign = /&&/;
var captureName = new RegExp(/^<(\S+)>/);
var captureName = new RegExp(/^< *?(\S+) *?>/);

var titleCase$2 = function titleCase(str) {
return str.charAt(0).toUpperCase() + str.substr(1);
Expand Down Expand Up @@ -2806,7 +2862,7 @@
return obj;
};

var _01ParseToken = parseToken;
var _02ParseToken = parseToken;

// name any [unnamed] capture-groups with a number
var nameGroups = function nameGroups(tokens) {
Expand Down Expand Up @@ -2935,68 +2991,10 @@
return tokens;
};

var _02PostProcess = postProcess;

var hasReg = /[^[a-z]]\//g;
var _03PostProcess = postProcess;

// True only for real arrays.
// Uses the built-in check: comparing the Object.prototype.toString tag can be
// fooled by any object that sets Symbol.toStringTag to 'Array'.
var isArray$2 = function isArray(arr) {
  return Array.isArray(arr);
}; // don't split up a regular expression


// Re-join pieces of a regular expression that an earlier split pulled apart.
// When an entry contains exactly one `hasReg` match, the next entry is appended
// to it; if the merged text still has exactly one match, the entry after that is
// appended too. Merged-away entries are blanked and then filtered out.
// Note: the entries of `arr` are modified in place; the filtered list is returned.
var mergeRegexes = function mergeRegexes(arr) {
  arr.forEach(function (s, i) {
    var m = s.match(hasReg); // has 1 slash

    if (m !== null && m.length === 1 && arr[i + 1]) {
      // merge next one
      arr[i] += arr[i + 1];
      arr[i + 1] = ''; // try 2nd one

      m = arr[i].match(hasReg);

      // only merge a second neighbour that actually exists - appending a
      // missing entry would glue the text 'undefined' onto the token
      if (m !== null && m.length === 1 && arr[i + 2]) {
        arr[i] += arr[i + 2];
        arr[i + 2] = '';
      }
    }
  });
  arr = arr.filter(function (s) {
    return s;
  });
  return arr;
}; //split-up by (these things)


// Split a match-string around its (parenthesised blocks), keeping each block -
// with its leading ^ [ ! / <name> and trailing ? + * ] $ markers - as its own
// trimmed entry. If the input matches `hasReg`, the pieces are passed through
// mergeRegexes before being returned.
var byParentheses = function byParentheses(str) {
  var arr = str.split(/([\^\[\!]*(?:<\S+>)?\(.*?\)[?+*]*\]?\$?)/);
  arr = arr.map(function (s) {
    return s.trim();
  });

  // hasReg carries the /g flag, so hasReg.test() would resume from the lastIndex
  // left behind by the previous call and give alternating answers for the same
  // input. search() always scans from the start and leaves lastIndex alone.
  if (str.search(hasReg) !== -1) {
    arr = mergeRegexes(arr);
  }

  return arr;
};

// Flatten a list of chunks into single words.
// A chunk that contains a (parenthesised group) is kept as one entry;
// every other chunk is split on spaces, with empty pieces left out.
var byWords = function byWords(arr) {
  return arr.reduce(function (words, chunk) {
    //keep brackets lumped together
    if (/\(.*\)/.test(chunk)) {
      words.push(chunk);
      return words;
    }

    var pieces = chunk.split(' ').filter(function (w) {
      return w;
    });
    return words.concat(pieces);
  }, []);
}; //turn an array into a 'choices' list


Expand Down Expand Up @@ -3101,21 +3099,20 @@
input = String(input); //go for it?
}

var tokens = byParentheses(input); // console.log(tokens)
var tokens = _01ParseBlocks(input); //turn them into objects

tokens = byWords(tokens);
tokens = tokens.map(function (str) {
return _01ParseToken(str);
return _02ParseToken(str);
}); //clean up anything weird

tokens = _02PostProcess(tokens, opts); // add fuzzy limits, etc
tokens = _03PostProcess(tokens, opts); // add fuzzy limits, etc

tokens = addOptions(tokens, opts); // console.log(tokens)

return tokens;
};

var matchSyntax = syntax; // console.log(syntax('before [(united states|canadian)] after'))
var matchSyntax = syntax; // console.log(syntax('[#Copula (#Adverb|not)+?] (#Gerund|#PastTense)'))

// match an explicit sequence of term ids
// take a phrase and find any of the idBlocks in it
Expand Down Expand Up @@ -3864,7 +3861,7 @@

var fromJSON_1 = fromJSON;

var _version = '13.10.2';
var _version = '13.10.3';

var entity = ['Person', 'Place', 'Organization'];
var nouns$1 = {
Expand Down
2 changes: 1 addition & 1 deletion builds/compromise.min.js

Large diffs are not rendered by default.

133 changes: 65 additions & 68 deletions builds/compromise.mjs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/* compromise 13.10.2 MIT */
/* compromise 13.10.3 MIT */
function _typeof(obj) {
"@babel/helpers - typeof";

Expand Down Expand Up @@ -2585,6 +2585,62 @@ var postProcess$1 = function postProcess(terms, regs, matches) {

var _04PostProcess = postProcess$1;

// Tokenizer patterns used to cut a match-syntax string into /regex/ tokens,
// (block) tokens and plain words.
// supported suffix-flags:
// suffixes: ? ] + * $ {2,6} ~
// [\?\]\+\*\$~]*
// prefixes: ! [ ^
// [\!\[\^]*
//
// A token must have whitespace (or the string boundary) on both sides.
// The trailing boundary is a lookahead, so the separating space is not consumed
// and can still act as the leading boundary of the next token. Consuming it made
// the second of two adjacent tokens ('/a b/ /c d/ foo') unmatchable, and it was
// then broken apart on its inner spaces.

// match 'foo /yes/' and not 'foo/no/bar'
var bySlashes = /(?:^|\s)([\!\[\^]*(?:<[^<]*>)?\/.*?[^\\\/]\/[\?\]\+\*\$~]*)(?=\s|$)/g;

// match '(yes) but not foo(no)bar'
var byParentheses = /(?:^|\s)([\!\[\^]*(?:<[^<]*>)?\(.*?[^\\\)]\)[\?\]\+\*\$~]*)(?=\s|$)/g;

// anything that is not a /regex/ or a (block) is split on single spaces
var byWord = / /g;

// Is this token a whole (parenthesised block)?
// It must open with '(' - optionally preceded by ! [ ^ prefixes and a <name> group -
// and close with ')' followed only by suffix flags (? ] + * $ ~).
var isBlock = function isBlock(str) {
  var opensWithParen = /^[\!\[\^]*(<[^<]*>)?\(/.test(str);

  if (!opensWithParen) {
    return false;
  }

  return /\)[\?\]\+\*\$~]*$/.test(str);
};

// Is this token a whole /regular expression/?
// It must open with '/' - optionally preceded by ! [ ^ prefixes and a <name> group -
// and close with '/' followed only by suffix flags (? ] + * $ ~).
var isReg = function isReg(str) {
  var opensWithSlash = /^[\!\[\^]*(<[^<]*>)?\//.test(str);

  if (!opensWithSlash) {
    return false;
  }

  return /\/[\?\]\+\*\$~]*$/.test(str);
};

// Trim every string in the list and leave out the ones that end up empty.
// Returns a new array; the input array is not modified.
var cleanUp = function cleanUp(arr) {
  var kept = [];
  arr.forEach(function (str) {
    var trimmed = str.trim();

    if (trimmed) {
      kept.push(trimmed);
    }
  });
  return kept;
};

// Cut a match-syntax string into its top-level tokens, in three passes:
//   1. pull out /regex/ tokens
//   2. pull out (block) tokens from each resulting piece
//   3. split whatever is neither a block nor a regex on spaces
// Returns an array of trimmed, non-empty token strings.
var parseBlocks = function parseBlocks(txt) {
  // passes 1 + 2
  var pieces = txt.split(bySlashes).reduce(function (found, part) {
    return found.concat(part.split(byParentheses));
  }, []);

  // pass 3 - blocks and regexes stay whole, the rest is split by spaces
  var tokens = cleanUp(pieces).reduce(function (found, piece) {
    if (isBlock(piece) || isReg(piece)) {
      found.push(piece);
      return found;
    }

    return found.concat(piece.split(byWord));
  }, []);

  return cleanUp(tokens);
};

var _01ParseBlocks = parseBlocks; // console.log(parseBlocks(`[<num>#Value] [<currency>(mark|rand|won|rub|ore)] foo`))

/* break-down a match expression into this:
{
word:'',
Expand All @@ -2604,7 +2660,7 @@ var _04PostProcess = postProcess$1;
*/
var hasMinMax = /\{([0-9]+,?[0-9]*)\}/;
var andSign = /&&/;
var captureName = new RegExp(/^<(\S+)>/);
var captureName = new RegExp(/^< *?(\S+) *?>/);

var titleCase$2 = function titleCase(str) {
return str.charAt(0).toUpperCase() + str.substr(1);
Expand Down Expand Up @@ -2800,7 +2856,7 @@ var parseToken = function parseToken(w) {
return obj;
};

var _01ParseToken = parseToken;
var _02ParseToken = parseToken;

// name any [unnamed] capture-groups with a number
var nameGroups = function nameGroups(tokens) {
Expand Down Expand Up @@ -2929,68 +2985,10 @@ var postProcess = function postProcess(tokens) {
return tokens;
};

var _02PostProcess = postProcess;

var hasReg = /[^[a-z]]\//g;
var _03PostProcess = postProcess;

// True only for real arrays.
// Uses the built-in check: comparing the Object.prototype.toString tag can be
// fooled by any object that sets Symbol.toStringTag to 'Array'.
var isArray$2 = function isArray(arr) {
  return Array.isArray(arr);
}; // don't split up a regular expression


// Re-join pieces of a regular expression that an earlier split pulled apart.
// When an entry contains exactly one `hasReg` match, the next entry is appended
// to it; if the merged text still has exactly one match, the entry after that is
// appended too. Merged-away entries are blanked and then filtered out.
// Note: the entries of `arr` are modified in place; the filtered list is returned.
var mergeRegexes = function mergeRegexes(arr) {
  arr.forEach(function (s, i) {
    var m = s.match(hasReg); // has 1 slash

    if (m !== null && m.length === 1 && arr[i + 1]) {
      // merge next one
      arr[i] += arr[i + 1];
      arr[i + 1] = ''; // try 2nd one

      m = arr[i].match(hasReg);

      // only merge a second neighbour that actually exists - appending a
      // missing entry would glue the text 'undefined' onto the token
      if (m !== null && m.length === 1 && arr[i + 2]) {
        arr[i] += arr[i + 2];
        arr[i + 2] = '';
      }
    }
  });
  arr = arr.filter(function (s) {
    return s;
  });
  return arr;
}; //split-up by (these things)


// Split a match-string around its (parenthesised blocks), keeping each block -
// with its leading ^ [ ! / <name> and trailing ? + * ] $ markers - as its own
// trimmed entry. If the input matches `hasReg`, the pieces are passed through
// mergeRegexes before being returned.
var byParentheses = function byParentheses(str) {
  var arr = str.split(/([\^\[\!]*(?:<\S+>)?\(.*?\)[?+*]*\]?\$?)/);
  arr = arr.map(function (s) {
    return s.trim();
  });

  // hasReg carries the /g flag, so hasReg.test() would resume from the lastIndex
  // left behind by the previous call and give alternating answers for the same
  // input. search() always scans from the start and leaves lastIndex alone.
  if (str.search(hasReg) !== -1) {
    arr = mergeRegexes(arr);
  }

  return arr;
};

// Flatten a list of chunks into single words.
// A chunk that contains a (parenthesised group) is kept as one entry;
// every other chunk is split on spaces, with empty pieces left out.
var byWords = function byWords(arr) {
  return arr.reduce(function (words, chunk) {
    //keep brackets lumped together
    if (/\(.*\)/.test(chunk)) {
      words.push(chunk);
      return words;
    }

    var pieces = chunk.split(' ').filter(function (w) {
      return w;
    });
    return words.concat(pieces);
  }, []);
}; //turn an array into a 'choices' list


Expand Down Expand Up @@ -3095,21 +3093,20 @@ var syntax = function syntax(input) {
input = String(input); //go for it?
}

var tokens = byParentheses(input); // console.log(tokens)
var tokens = _01ParseBlocks(input); //turn them into objects

tokens = byWords(tokens);
tokens = tokens.map(function (str) {
return _01ParseToken(str);
return _02ParseToken(str);
}); //clean up anything weird

tokens = _02PostProcess(tokens, opts); // add fuzzy limits, etc
tokens = _03PostProcess(tokens, opts); // add fuzzy limits, etc

tokens = addOptions(tokens, opts); // console.log(tokens)

return tokens;
};

var matchSyntax = syntax; // console.log(syntax('before [(united states|canadian)] after'))
var matchSyntax = syntax; // console.log(syntax('[#Copula (#Adverb|not)+?] (#Gerund|#PastTense)'))

// match an explicit sequence of term ids
// take a phrase and find any of the idBlocks in it
Expand Down Expand Up @@ -3858,7 +3855,7 @@ var fromJSON = function fromJSON(json, world) {

var fromJSON_1 = fromJSON;

var _version = '13.10.2';
var _version = '13.10.3';

var entity = ['Person', 'Place', 'Organization'];
var nouns$1 = {
Expand Down
5 changes: 4 additions & 1 deletion changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,12 @@ compromise uses semver, and pushes to npm frequently
While all _Major_ releases should be reviewed, our only two _large_ releases are **v6** in 2016 and **v12** in 2019. Others have been mostly incremental, or niche.

<!-- #### [Unreleased]
-->

#### 13.10.3 [March 2021]
- **[fix]** - support complicated regular-expressions in match syntax
- improved performance testing

#### 13.10.2 [March 2021]
- **[fix]** - support matching implicit terms in (or|blocks)
- **[change]** - add #Timezone tag (from date-plugin)
Expand Down
Loading

0 comments on commit 1e0a9ba

Please sign in to comment.