Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Microserviceify libpostal #1060

Merged
merged 7 commits into from
Nov 28, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .npmrc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
package-lock=false
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# base image
FROM pelias/libpostal_baseimage
FROM pelias/baseimage

# maintainer information
LABEL maintainer="pelias@mapzen.com"
Expand Down
2 changes: 0 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ The API recognizes the following properties under the top-level `api` key in you
|parameter|required|default|description|
|---|---|---|---|
|`host`|*yes*||specifies the url under which the http service is to run|
|`textAnalyzer`|*no*|*addressit*|can be either `libpostal` or `addressit` however will soon be **deprecated** and only `libpostal` will be supported going forward|
|`indexName`|*no*|*pelias*|name of the Elasticsearch index to be used when building queries|
|`relativeScores`|*no*|true|if set to true, confidence scores will be normalized, realistically at this point setting this to false is not tested or desirable
|`accessLog`|*no*||name of the format to use for access logs; may be any one of the [predefined values](https://github.com/expressjs/morgan#predefined-formats) in the `morgan` package. Defaults to `"common"`; if set to `false`, or an otherwise falsy value, disables access-logging entirely.|
Expand All @@ -66,7 +65,6 @@ Example configuration file would look something like this:
"host": "localhost:3100/v1/",
"indexName": "foobar",
"relativeScores": true,
"textAnalyzer": "libpostal",
"services": {
"pip": {
"url": "http://mypipservice.com:3000"
Expand Down
105 changes: 91 additions & 14 deletions controller/libpostal.js
Original file line number Diff line number Diff line change
@@ -1,31 +1,108 @@
const text_analyzer = require('pelias-text-analyzer');
const _ = require('lodash');
const iso3166 = require('iso3166-1');
const Debug = require('../helper/debug');
const debugLog = new Debug('controller:libpostal');
const logger = require('pelias-logger').get('api');

function setup(should_execute) {
// mapping object from libpostal fields to pelias fields
var field_mapping = {
island: 'island',
category: 'category',
house: 'query',
house_number: 'number',
road: 'street',
suburb: 'neighbourhood',
city_district: 'borough',
city: 'city',
state_district: 'county',
state: 'state',
postcode: 'postalcode',
country: 'country'
};

// This controller calls the hosted libpostal service and converts the response
// to a generic format for later use. The hosted service returns an array like:
//
// ```
// [
// {
// label: 'house_number',
// value: '30'
// },
// {
// label: 'road',
// value: 'west 26th street'
// },
// {
// label: 'city',
// value: 'new york'
// },
// {
// label: 'state',
// value: 'ny'
// }
//]
// ```
//
// where `label` can be any of (currently):
// - house (generally interpreted as unknown, treated by pelias like a query term)
// - category (like "restaurants")
// - house_number
// - road
// - unit (apt or suite #)
// - suburb (like a neighbourhood)
// - city
// - city_district (like an NYC borough)
// - state_district (like a county)
// - state
// - postcode
// - country
//
// The Pelias query module is not concerned with unit.
//
function setup(libpostalService, should_execute) {
function controller( req, res, next ){
// bail early if req/res don't pass conditions for execution
if (!should_execute(req, res)) {
return next();
}

const initialTime = debugLog.beginTimer(req);
// parse text with query parser
const parsed_text = text_analyzer.parse(req.clean.text);

if (parsed_text !== undefined) {
// if a known ISO2 country was parsed, convert it to ISO3
if (_.has(parsed_text, 'country') && iso3166.is2(_.toUpper(parsed_text.country))) {
parsed_text.country = iso3166.to3(_.toUpper(parsed_text.country));
libpostalService(req, (err, response) => {
if (err) {
// push err.message or err onto req.errors
req.errors.push( _.get(err, 'message', err) );

} else if (_.some(_.countBy(response, o => o.label), count => count > 1)) {
logger.warn(`discarding libpostal parse of '${req.clean.text}' due to duplicate field assignments`);
return next();

} else if (_.isEmpty(response)) {
return next();

} else {
req.clean.parser = 'libpostal';
req.clean.parsed_text = response.reduce(function(o, f) {
if (field_mapping.hasOwnProperty(f.label)) {
o[field_mapping[f.label]] = f.value;
}

return o;
}, {});

if (_.has(req.clean.parsed_text, 'country') && iso3166.is2(_.toUpper(req.clean.parsed_text.country))) {
req.clean.parsed_text.country = iso3166.to3(_.toUpper(req.clean.parsed_text.country));
}

debugLog.push(req, {parsed_text: req.clean.parsed_text});

}

req.clean.parser = 'libpostal';
req.clean.parsed_text = parsed_text;
debugLog.push(req, {parsed_text: req.clean.parsed_text});
}
debugLog.stopTimer(req, initialTime);
return next();
debugLog.stopTimer(req, initialTime);
return next();

});

}

Expand Down
75 changes: 75 additions & 0 deletions controller/structured_libpostal.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
const _ = require('lodash');
const Debug = require('../helper/debug');
const debugLog = new Debug('controller:libpostal');
const logger = require('pelias-logger').get('api');

// if there's a house_number in the libpostal response, return it
// otherwise return the postcode field (which may be undefined)
function findHouseNumberField(response) {
const house_number_field = response.find(f => f.label === 'house_number');

if (house_number_field) {
return house_number_field;
}

return response.find(f => f.label === 'postcode');

}

function setup(libpostalService, should_execute) {
function controller( req, res, next ){
// bail early if req/res don't pass conditions for execution
if (!should_execute(req, res)) {
return next();
}

const initialTime = debugLog.beginTimer(req);

libpostalService(req, (err, response) => {
if (err) {
// push err.message or err onto req.errors
req.errors.push( _.get(err, 'message', err) );

} else {
// figure out which field contains the probable house number, prefer house_number
// libpostal parses some inputs, like `3370 cobbe ave`, as a postcode+street
// so because we're treating the entire field as a street address, it's safe
// to assume that an identified postcode is actually a house number.
const house_number_field = findHouseNumberField(response);

// if we're fairly certain that libpostal identified a house number
// (from either the house_number or postcode field), place it into the
// number field and remove the first instance of that value from address
// and assign to street
// eg - '1090 N Charlotte St' becomes number=1090 and street=N Charlotte St
if (house_number_field) {
req.clean.parsed_text.number = house_number_field.value;

// remove the first instance of the number and trim whitespace
req.clean.parsed_text.street = _.trim(_.replace(req.clean.parsed_text.address, req.clean.parsed_text.number, ''));

} else {
// otherwise no house number was identifiable, so treat the entire input
// as a street
req.clean.parsed_text.street = req.clean.parsed_text.address;

}

// the address field no longer means anything since it's been parsed, so remove it
delete req.clean.parsed_text.address;

debugLog.push(req, {parsed_text: response});

}

debugLog.stopTimer(req, initialTime);
return next();

});

}

return controller;
}

module.exports = setup;
1 change: 0 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,6 @@
"pelias-model": "5.2.0",
"pelias-query": "9.1.1",
"pelias-sorting": "1.1.0",
"pelias-text-analyzer": "1.10.2",
"predicates": "^2.0.0",
"retry": "^0.10.1",
"stats-lite": "^2.0.4",
Expand Down
23 changes: 22 additions & 1 deletion routes/v1.js
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ var controllers = {
coarse_reverse: require('../controller/coarse_reverse'),
mdToHTML: require('../controller/markdownToHtml'),
libpostal: require('../controller/libpostal'),
structured_libpostal: require('../controller/structured_libpostal'),
place: require('../controller/place'),
placeholder: require('../controller/placeholder'),
search: require('../controller/search'),
Expand Down Expand Up @@ -96,6 +97,7 @@ const PlaceHolder = require('../service/configurations/PlaceHolder');
const PointInPolygon = require('../service/configurations/PointInPolygon');
const Language = require('../service/configurations/Language');
const Interpolation = require('../service/configurations/Interpolation');
const Libpostal = require('../service/configurations/Libpostal');

/**
* Append routes to app
Expand All @@ -122,6 +124,18 @@ function addRoutes(app, peliasConfig) {
const interpolationService = serviceWrapper(interpolationConfiguration);
const isInterpolationEnabled = _.constant(interpolationConfiguration.isEnabled());

// standard libpostal should use req.clean.text for the `address` parameter
const libpostalConfiguration = new Libpostal(
_.defaultTo(peliasConfig.api.services.libpostal, {}),
_.property('clean.text'));
const libpostalService = serviceWrapper(libpostalConfiguration);

// structured libpostal should use req.clean.parsed_text.address for the `address` parameter
const structuredLibpostalConfiguration = new Libpostal(
_.defaultTo(peliasConfig.api.services.libpostal, {}),
_.property('clean.parsed_text.address'));
const structuredLibpostalService = serviceWrapper(structuredLibpostalConfiguration);

// fallback to coarse reverse when regular reverse didn't return anything
const coarseReverseShouldExecute = all(
isPipServiceEnabled, not(hasRequestErrors), not(hasResponseData)
Expand All @@ -132,6 +146,12 @@ function addRoutes(app, peliasConfig) {
not(isRequestSourcesOnlyWhosOnFirst)
);

// for libpostal to execute for structured requests, req.clean.parsed_text.address must exist
const structuredLibpostalShouldExecute = all(
not(hasRequestErrors),
hasParsedTextProperties.all('address')
);

// execute placeholder if libpostal only parsed as admin-only and needs to
// be geodisambiguated
const placeholderGeodisambiguationShouldExecute = all(
Expand Down Expand Up @@ -256,7 +276,7 @@ function addRoutes(app, peliasConfig) {
sanitizers.search.middleware(peliasConfig.api),
middleware.requestLanguage,
middleware.calcSize(),
controllers.libpostal(libpostalShouldExecute),
controllers.libpostal(libpostalService, libpostalShouldExecute),
controllers.placeholder(placeholderService, geometricFiltersApply, placeholderGeodisambiguationShouldExecute),
controllers.placeholder(placeholderService, geometricFiltersDontApply, placeholderIdsLookupShouldExecute),
controllers.search_with_ids(peliasConfig.api, esclient, queries.address_using_ids, searchWithIdsShouldExecute),
Expand Down Expand Up @@ -286,6 +306,7 @@ function addRoutes(app, peliasConfig) {
sanitizers.structured_geocoding.middleware(peliasConfig.api),
middleware.requestLanguage,
middleware.calcSize(),
controllers.structured_libpostal(structuredLibpostalService, structuredLibpostalShouldExecute),
controllers.search(peliasConfig.api, esclient, queries.structured_geocoding, not(hasResponseDataOrRequestErrors)),
postProc.trimByGranularityStructured(),
postProc.distances('focus.point.'),
Expand Down
44 changes: 1 addition & 43 deletions sanitizer/_synthesize_analysis.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
const _ = require('lodash');
const text_analyzer = require('pelias-text-analyzer');

const fields = {
'venue': 'query',
Expand All @@ -17,20 +16,6 @@ function normalizeWhitespaceToSingleSpace(val) {
return _.replace(_.trim(val), /\s+/g, ' ');
}

function isPostalCodeOnly(parsed_text) {
return Object.keys(parsed_text).length === 1 &&
parsed_text.hasOwnProperty('postalcode');
}

// figure out which field contains the probable house number, prefer number
// libpostal parses some inputs, like `3370 cobbe ave`, as a postcode+street
// so because we're treating the entire field as a street address, it's safe
// to assume that an identified postcode is actually a house number.
function getHouseNumberField(analyzed_address) {
// return the first field available in the libpostal response, undefined if none
return _.find(['number', 'postalcode'], _.partial(_.has, analyzed_address));
}

function _sanitize( raw, clean ){

// error & warning messages
Expand All @@ -51,35 +36,8 @@ function _sanitize( raw, clean ){
`at least one of the following fields is required: ${Object.keys(fields).join(', ')}`);
}

if (clean.parsed_text.hasOwnProperty('address')) {
const analyzed_address = text_analyzer.parse(clean.parsed_text.address);

const house_number_field = getHouseNumberField(analyzed_address);

// if we're fairly certain that libpostal identified a house number
// (from either the house_number or postcode field), place it into the
// number field and remove the first instance of that value from address
// and assign to street
// eg - '1090 N Charlotte St' becomes number=1090 and street=N Charlotte St
if (house_number_field) {
clean.parsed_text.number = analyzed_address[house_number_field];

// remove the first instance of the number and trim whitespace
clean.parsed_text.street = _.trim(_.replace(clean.parsed_text.address, clean.parsed_text.number, ''));

} else {
// otherwise no house number was identifiable, so treat the entire input
// as a street
clean.parsed_text.street = clean.parsed_text.address;

}

// the address field no longer means anything since it's been parsed, so remove it
delete clean.parsed_text.address;

}

return messages;

}

function _expected() {
Expand Down
5 changes: 5 additions & 0 deletions schema.js
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,11 @@ module.exports = Joi.object().keys({
url: Joi.string().uri({ scheme: /https?/ }),
timeout: Joi.number().integer().optional().default(250).min(0),
retries: Joi.number().integer().optional().default(3).min(0),
}).unknown(false).requiredKeys('url'),
libpostal: Joi.object().keys({
url: Joi.string().uri({ scheme: /https?/ }),
timeout: Joi.number().integer().optional().default(250).min(0),
retries: Joi.number().integer().optional().default(3).min(0),
}).unknown(false).requiredKeys('url')
}).unknown(false).default({}), // default api.services to an empty object
defaultParameters: Joi.object().keys({
Expand Down
Loading