Skip to content

Commit

Permalink
Merge pull request #169 from pelias/ngram-address-parser
Browse files Browse the repository at this point in the history
Address Parser
  • Loading branch information
hkrishna committed Jul 23, 2015
2 parents 5401ed9 + fae9aae commit 1b6d3fe
Show file tree
Hide file tree
Showing 18 changed files with 788 additions and 60 deletions.
5 changes: 5 additions & 0 deletions controller/search.js
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,11 @@ function setup( backend, query ){
cmd.type = req.clean.layers;
}

// set type if input suggests targeting a layer(s)
if (req.clean.default_layers_set && req.clean.parsed_input) {
cmd.type = req.clean.parsed_input.target_layer || cmd.type;
}

// query backend
service.search( backend, cmd, function( err, docs ){

Expand Down
13 changes: 13 additions & 0 deletions helper/admin_weights.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
/**
* These values specify how much a document that matches a certain _type
* should be boosted in elasticsearch results.
*/

module.exports = {
'admin0': 4,
'admin1': 3,
'admin2': 2,
'local_admin': 1,
'locality':1,
'neighborhood':1
};
93 changes: 93 additions & 0 deletions helper/query_parser.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@

var parser = require('addressit');
var extend = require('extend');
var get_layers = require('../helper/layers');
var delim = ',';

module.exports = function(query) {

var tokenized = query.split(/[ ,]+/);
var hasNumber = /\d/.test(query);

var getAdminPartsBySplittingOnDelim = function(query) {
// naive approach - for admin matching during query time
// split 'flatiron, new york, ny' into 'flatiron' and 'new york, ny'
var delimIndex = query.indexOf(delim);
var address = {};
if ( delimIndex !== -1 ) {
address.name = query.substring(0, delimIndex);
address.admin_parts = query.substring(delimIndex + 1).trim();
}

return address;
};

var getTargetLayersWhenAddressParsingIsNotNecessary = function(query) {
var address = {};
// set target_layer if input length <= 3 characters
if (query.length <= 3 ) {
// no address parsing required
address.target_layer = get_layers(['admin']);
} else if (tokenized.length === 1 || (tokenized.length < 3 && !hasNumber)) {
// no need to hit address layers if there's only one (or two) token(s)
address.target_layer = get_layers(['admin', 'poi']);
}

return address.target_layer ? address : null;
};

var getAddressParts = function(query) {
// address parsing
var address = parser( query );
// set target_layer if input suggests no address
if (address.text === address.regions.join(' ') && !hasNumber) {
address.target_layer = get_layers(['admin', 'poi']);
}

return address;
};

var addressWithAdminParts = getAdminPartsBySplittingOnDelim(query);
var addressWithTargetLayers= getTargetLayersWhenAddressParsingIsNotNecessary(query);
var addressWithAddressParts= !addressWithTargetLayers ? getAddressParts(query) : {};

var parsedAddress = extend(addressWithAdminParts,
addressWithTargetLayers,
addressWithAddressParts);

var address_parts = [ 'name',
'number',
'street',
'city',
'state',
'country',
'postalcode',
'regions',
'admin_parts',
'target_layer'
];

var parsed_input = {};

address_parts.forEach(function(part){
if (parsedAddress[part]) {
parsed_input[part] = parsedAddress[part];
}
});

return parsed_input;
};


// parsed_input = {
// name : parsedAddress.name,
// number : parsedAddress.number,
// street : parsedAddress.street,
// city : parsedAddress.city,
// state : parsedAddress.state,
// country: parsedAddress.country,
// postalcode : parsedAddress.postalcode,
// regions: parsedAddress.regions,
// admin_parts: parsedAddress.admin_parts,
// target_layer: parsedAddress.target_layer
// }
2 changes: 2 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@
"microtime": "1.4.0",
"morgan": "1.5.2",
"pelias-config": "^0.1.4",
"extend": "2.0.1",
"addressit": "1.3.0",
"pelias-esclient": "0.0.25",
"pelias-logger": "^0.0.8",
"pelias-suggester-pipeline": "2.0.2",
Expand Down
100 changes: 81 additions & 19 deletions query/search.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,42 +13,104 @@ function generate( params ){
}

var query = queries.distance( centroid, { size: params.size } );

var input = params.input;

if (params.bbox) {
query = queries.bbox ( centroid, { size: params.size, bbox: params.bbox } );
}

// add search condition to filtered query
query.query.filtered.query = {
'bool': {
'must': [{
'match': {
'name.default': params.input
}
}]
'must': [],
'should': []
}
};

// should query conditions
query.query.filtered.query.bool.should = [];
if (params.parsed_input) {

query.query.filtered.query.bool.should = [];

var unmatched_admin_fields = [];
// qb = "query builder": for every field name in `fields`, append a `match`
// clause pairing that field with `value` to the bool query's `should` list.
// Nothing is appended when `value` is falsy.
var qb = function(fields, value) {
  if (!value) {
    return;
  }
  fields.forEach(function(field) {
    var clause = { 'match': {} };
    clause.match[field] = value;
    query.query.filtered.query.bool.should.push(clause);
  });
};

// update input
if (params.parsed_input.number && params.parsed_input.street) {
input = params.parsed_input.number + ' ' + params.parsed_input.street;
} else if (params.parsed_input.admin_parts) {
input = params.parsed_input.name;
}

if (params.input_admin) {
var admin_fields = ['admin0', 'admin1', 'admin1_abbr', 'admin2', 'alpha3'];
// address
// number, street, postalcode
if (params.parsed_input.number) {
qb(['address.number'], params.parsed_input.number);
}
if (params.parsed_input.street) {
qb(['address.street'], params.parsed_input.street);
}
if (params.parsed_input.postalcode) {
qb(['address.zip'], params.parsed_input.postalcode);
}

admin_fields.forEach(function(admin_field) {
var match = {};
match[admin_field] = params.input_admin;
query.query.filtered.query.bool.should.push({
'match': match
});
});
// city
// admin2, locality, local_admin, neighborhood
if (params.parsed_input.city) {
  // The parsed input carries the city under `city`; the parser never emits
  // an `admin2` key, so passing `parsed_input.admin2` here was always
  // undefined and the city clause was silently dropped by qb.
  qb(['admin2'], params.parsed_input.city);
} else {
  unmatched_admin_fields.push('admin2');
}

// state
// admin1, admin1_abbr
if (params.parsed_input.state) {
qb(['admin1_abbr'], params.parsed_input.state);
} else {
unmatched_admin_fields.push('admin1', 'admin1_abbr');
}

// country
// admin0, alpha3
if (params.parsed_input.country) {
qb(['alpha3'], params.parsed_input.country);
} else {
unmatched_admin_fields.push('admin0', 'alpha3');
}

var input_regions = params.parsed_input.regions ? params.parsed_input.regions.join(' ') : undefined;
// if no address was identified and input suggests some admin info in it
if (unmatched_admin_fields.length === 5 && input_regions !== params.input) {
if (params.parsed_input.admin_parts) {
qb(unmatched_admin_fields, params.parsed_input.admin_parts);
} else {
qb(unmatched_admin_fields, input_regions);
}
}

}

// add search condition to distance query
query.query.filtered.query.bool.must.push({
'match': {
'name.default': input
}
});

// add phrase matching query
// note: this is required for shingle/phrase matching
query.query.filtered.query.bool.should.push({
'match': {
'phrase.default': params.input
'phrase.default': input
}
});

Expand Down
13 changes: 12 additions & 1 deletion query/sort.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ var population = 'population';
var popularity = 'popularity';
var category = 'category';
var category_weights = require('../helper/category_weights');
var admin_weights = require('../helper/admin_weights');
var weights = require('pelias-suggester-pipeline').weights;
var isObject = require( 'is-object' );

Expand All @@ -15,6 +16,13 @@ module.exports = function( params ){
'order': 'desc'
}
},
{
'_script': {
'file': popularity,
'type': 'number',
'order': 'desc'
}
},
{
'_script': {
'file': population,
Expand All @@ -24,7 +32,10 @@ module.exports = function( params ){
},
{
'_script': {
'file': popularity,
'params': {
'weights': admin_weights
},
'file': 'weights',
'type': 'number',
'order': 'desc'
}
Expand Down
15 changes: 5 additions & 10 deletions sanitiser/_input.js
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
var isObject = require('is-object');
var isObject = require('is-object');
var query_parse= require('../helper/query_parser');

// validate inputs, convert types and apply defaults
function sanitize( req ){

req.clean = req.clean || {};
var params= req.query;
var delim = ',';


// ensure the input params are a valid object
if( !isObject( params ) ){
params = {};
Expand All @@ -22,13 +22,8 @@ function sanitize( req ){

req.clean.input = params.input;

// for admin matching during query time
// split 'flatiron, new york, ny' into 'flatiron' and 'new york, ny'
var delim_index = params.input.indexOf(delim);
if ( delim_index !== -1 ) {
req.clean.input = params.input.substring(0, delim_index);
req.clean.input_admin = params.input.substring(delim_index + 1).trim();
}
req.clean.parsed_input = query_parse(params.input);


return { 'error': false };

Expand Down
1 change: 1 addition & 0 deletions sanitiser/_layers.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ function sanitize( req ){
// default case (no layers specified in GET params)
if('string' !== typeof params.layers || !params.layers.length){
params.layers = 'poi,admin,address'; // default layers
clean.default_layers_set = true;
}

// decide which layers can be queried
Expand Down
Loading

0 comments on commit 1b6d3fe

Please sign in to comment.