Skip to content

Commit

Permalink
Use parse5 as a default parser (closes #863) (#985)
Browse files Browse the repository at this point in the history
* Use parse5 as a default parser (closes #863)

* Use documents via $.load

* Add test for #997

* Change options format
  • Loading branch information
inikulin authored Dec 20, 2020
1 parent 3368605 commit 6e115ee
Show file tree
Hide file tree
Showing 15 changed files with 253 additions and 99 deletions.
7 changes: 4 additions & 3 deletions Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ __ϟ Blazingly fast:__
Cheerio works with a very simple, consistent DOM model. As a result parsing, manipulating, and rendering are incredibly efficient.

__❁ Incredibly flexible:__
Cheerio wraps around @FB55's forgiving [htmlparser2](https://github.com/fb55/htmlparser2/). Cheerio can parse nearly any HTML or XML document.
Cheerio wraps around the [parse5](https://github.com/inikulin/parse5) parser and can optionally use @FB55's forgiving [htmlparser2](https://github.com/fb55/htmlparser2/). Cheerio can parse nearly any HTML or XML document.

## Cheerio is not a web browser

Expand Down Expand Up @@ -273,14 +273,15 @@ const $ = cheerio.load('<ul id="fruits">...</ul>', {
});
```

These parsing options are taken directly from [htmlparser2](https://github.com/fb55/htmlparser2/wiki/Parser-options), therefore any options that can be used in `htmlparser2` are valid in cheerio as well. The default options are:
These parsing options are taken directly from [htmlparser2](https://github.com/fb55/htmlparser2/wiki/Parser-options), therefore any options that can be used in `htmlparser2` are valid in cheerio as well. If any of these options is set to a non-default value, cheerio will implicitly use `htmlparser2` as the underlying parser. In addition, you can use the `useHtmlParser2` option to force cheerio to use `htmlparser2` instead of `parse5`. The default options are:

```js
{
withDomLvl1: true,
normalizeWhitespace: false,
xmlMode: false,
decodeEntities: true
decodeEntities: true,
useHtmlParser2: false
}

```
Expand Down
4 changes: 2 additions & 2 deletions lib/api/attributes.js
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,7 @@ exports.val = function(value) {
returnValue;
if (option === undefined) return undefined;
if (!querying) {
if (!this.attr().hasOwnProperty('multiple') && typeof value == 'object') {
if (!hasOwn.call(this.attr(), 'multiple') && typeof value == 'object') {
return this;
}
if (typeof value != 'object') {
Expand All @@ -283,7 +283,7 @@ exports.val = function(value) {
return this;
}
returnValue = option.attr('value');
if (this.attr().hasOwnProperty('multiple')) {
if (hasOwn.call(this.attr(), 'multiple')) {
returnValue = [];
domEach(option, function(__, el) {
returnValue.push(getAttr(el, 'value'));
Expand Down
4 changes: 2 additions & 2 deletions lib/api/manipulation.js
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ exports._makeDomArray = function makeDomArray(elem, clone) {
return this._makeDomArray(el, clone);
}, this));
} else if (typeof elem === 'string') {
return evaluate(elem, this.options);
return evaluate(elem, this.options, false);
} else {
return clone ? cloneDom([elem]) : [elem];
}
Expand Down Expand Up @@ -392,7 +392,7 @@ exports.html = function(str) {
child.next = child.prev = child.parent = null;
});

var content = str.cheerio ? str.clone().get() : evaluate('' + str, opts);
var content = str.cheerio ? str.clone().get() : evaluate('' + str, opts, false);

updateDOM(content, el);
});
Expand Down
21 changes: 6 additions & 15 deletions lib/cheerio.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
*/

var parse = require('./parse'),
defaultOptions = require('./options').default,
flattenOptions = require('./options').flatten,
isHtml = require('./utils').isHtml,
_ = {
extend: require('lodash/assignIn'),
Expand Down Expand Up @@ -30,13 +32,13 @@ var api = [
var Cheerio = module.exports = function(selector, context, root, options) {
if (!(this instanceof Cheerio)) return new Cheerio(selector, context, root, options);

this.options = _.defaults(options || {}, this.options);
this.options = _.defaults(flattenOptions(options), this.options, defaultOptions);

// $(), $(null), $(undefined), $(false)
if (!selector) return this;

if (root) {
if (typeof root === 'string') root = parse(root, this.options);
if (typeof root === 'string') root = parse(root, this.options, false);
this._root = Cheerio.call(this, root);
}

Expand All @@ -58,7 +60,7 @@ var Cheerio = module.exports = function(selector, context, root, options) {

// $(<html>)
if (typeof selector === 'string' && isHtml(selector)) {
return Cheerio.call(this, parse(selector, this.options).children);
return Cheerio.call(this, parse(selector, this.options, false).children);
}

// If we don't have a context, maybe we have a root, from loading
Expand All @@ -67,7 +69,7 @@ var Cheerio = module.exports = function(selector, context, root, options) {
} else if (typeof context === 'string') {
if (isHtml(context)) {
// $('li', '<ul>...</ul>')
context = parse(context, this.options);
context = parse(context, this.options, false);
context = Cheerio.call(this, context);
} else {
// $('li', 'ul')
Expand Down Expand Up @@ -98,17 +100,6 @@ _.extend(Cheerio, require('./static'));

Cheerio.prototype.cheerio = '[cheerio object]';

/*
* Cheerio default options
*/

Cheerio.prototype.options = {
withDomLvl1: true,
normalizeWhitespace: false,
xmlMode: false,
decodeEntities: true
};

/*
* Make cheerio an array-like object
*/
Expand Down
16 changes: 16 additions & 0 deletions lib/options.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
var assign = require('lodash/assign');

/*
 * Cheerio default options
 *
 * Fallback option values. Callers merge these underneath user-supplied
 * options (via `_.defaults(...)` in lib/cheerio.js and lib/static.js), so a
 * key listed here only applies when the caller did not set it.
 *
 * `useHtmlParser2` is intentionally absent from this object, so it is
 * `undefined` (falsy) unless the caller provides it.
 */

exports.default = {
withDomLvl1: true,
normalizeWhitespace: false,
// When truthy, lib/parse.js parses string content with htmlparser2 rather than parse5.
xmlMode: false,
decodeEntities: true
};

exports.flatten = function(options) {
return options && options.xml ? assign({xmlMode: true}, options.xml) : options;
};
28 changes: 21 additions & 7 deletions lib/parse.js
Original file line number Diff line number Diff line change
@@ -1,31 +1,45 @@
/*
Module Dependencies
*/
var htmlparser = require('htmlparser2');
var htmlparser = require('htmlparser2'),
parse5 = require('parse5');

/*
Parser
*/
exports = module.exports = function(content, options) {
var dom = exports.evaluate(content, options),
exports = module.exports = function(content, options, isDocument) {
var dom = exports.evaluate(content, options, isDocument),
// Generic root element
root = exports.evaluate('<root></root>', options)[0];
root = exports.evaluate('<root></root>', options, false)[0];

root.type = 'root';
root.parent = null;

// Update the dom using the root
exports.update(dom, root);

return root;
};

exports.evaluate = function(content, options) {
/*
 * Parse `content` with parse5 and return the children of the resulting root.
 *
 * `isDocument` truthy  -> `parse5.parse` is used.
 * `isDocument` falsy   -> `parse5.parseFragment` is used.
 *
 * parse5 is always given its `htmlparser2` tree adapter
 * (`parse5.treeAdapters.htmlparser2`).
 */
function parseWithParse5 (content, isDocument) {
  var parseFn;

  if (isDocument) {
    parseFn = parse5.parse;
  } else {
    parseFn = parse5.parseFragment;
  }

  var parserOptions = { treeAdapter: parse5.treeAdapters.htmlparser2 };

  return parseFn(content, parserOptions).children;
}

exports.evaluate = function(content, options, isDocument) {
// options = options || $.fn.options;

var dom;

if (typeof content === 'string' || Buffer.isBuffer(content)) {
dom = htmlparser.parseDOM(content, options);
if (Buffer.isBuffer(content))
content = content.toString();

if (typeof content === 'string') {
var useHtmlParser2 = options.xmlMode || options.useHtmlParser2;

dom = useHtmlParser2 ? htmlparser.parseDOM(content, options) : parseWithParse5(content, isDocument);
} else {
dom = content;
}
Expand Down
17 changes: 10 additions & 7 deletions lib/static.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
*/

var serialize = require('dom-serializer'),
defaultOptions = require('./options').default,
flattenOptions = require('./options').flatten,
select = require('css-select'),
parse = require('./parse'),
_ = {
Expand All @@ -14,12 +16,15 @@ var serialize = require('dom-serializer'),
* $.load(str)
*/

exports.load = function(content, options) {
exports.load = function(content, options, isDocument) {
var Cheerio = require('./cheerio');

options = _.defaults(options || {}, Cheerio.prototype.options);
options = _.defaults(flattenOptions(options || {}), defaultOptions);

var root = parse(content, options);
if (isDocument === void 0)
isDocument = true;

var root = parse(content, options, isDocument);

var initialize = function(selector, context, r, opts) {
if (!(this instanceof initialize)) {
Expand Down Expand Up @@ -75,8 +80,6 @@ function render(that, dom, options) {
*/

exports.html = function(dom, options) {
var Cheerio = require('./cheerio');

// be flexible about parameters, sometimes we call html(),
// with options as only parameter
// check dom argument for dom element specific properties
Expand All @@ -89,7 +92,7 @@ exports.html = function(dom, options) {

// sometimes $.html() used without preloading html
// so fallback non existing options to the default ones
options = _.defaults(options || {}, this._options, Cheerio.prototype.options);
options = _.defaults(flattenOptions(options || {}), this._options, defaultOptions);

return render(this, dom, options);
};
Expand Down Expand Up @@ -144,7 +147,7 @@ exports.parseHTML = function(data, context, keepScripts) {
keepScripts = context;
}

parsed = this.load(data);
parsed = this.load(data, defaultOptions, false);
if (!keepScripts) {
parsed('script').remove();
}
Expand Down
7 changes: 5 additions & 2 deletions lib/utils.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
var parse = require('./parse'),
render = require('dom-serializer');
render = require('dom-serializer'),
assign = require('lodash/assign');

/**
* HTML Tags
Expand Down Expand Up @@ -61,7 +62,9 @@ exports.domEach = function(cheerio, fn) {
* @argument {Object} options - The parsing/rendering options
*/
exports.cloneDom = function(dom, options) {
return parse(render(dom, options), options).children;
options = assign({}, options, { useHtmlParser2: true });

return parse(render(dom, options), options, false).children;
};

/*
Expand Down
3 changes: 2 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@
"dom-serializer": "~0.1.1",
"entities": "~1.1.2",
"htmlparser2": "^3.10.1",
"lodash": "^4.17.11"
"lodash": "^4.17.11",
"parse5": "^3.0.1"
},
"devDependencies": {
"benchmark": "^2.1.4",
Expand Down
2 changes: 1 addition & 1 deletion test/api/manipulation.js
Original file line number Diff line number Diff line change
Expand Up @@ -1395,7 +1395,7 @@ describe('$(...)', function() {
});

it('() : should pass options', function() {
var dom = cheerio.load('&', {decodeEntities: false});
var dom = cheerio.load('&', {xml: {decodeEntities: false}});
expect(dom.root().toString()).to.equal('&');
});
});
Expand Down
Loading

0 comments on commit 6e115ee

Please sign in to comment.