Skip to content
This repository has been archived by the owner on Aug 12, 2021. It is now read-only.

Commit

Permalink
Basic solr field customization with preliminary spelling variants #81
Browse files Browse the repository at this point in the history
  • Loading branch information
rlskoeser committed Aug 6, 2018
1 parent d3c831f commit f4fcda5
Show file tree
Hide file tree
Showing 2 changed files with 76 additions and 2 deletions.
4 changes: 3 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@ djiffy
annotator_store
cached_property
# unreleased version of SolrClient with ordered facets and facet ranges
git+https://github.com/moonlitesolutions/SolrClient.git@19c5280c9f8e97#egg=SolrClient
#git+https://github.com/moonlitesolutions/SolrClient.git@19c5280c9f8e97#egg=SolrClient
# unreleased version of SolrClient with schema field-type management
git+https://github.com/rlskoeser/SolrClient.git@schema-field-type-support19c5280c9f8e97#egg=SolrClient
#solrclient
progressbar2
unidecode
74 changes: 73 additions & 1 deletion winthrop/common/solr.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,58 @@ class SolrSchema(object):
'''Solr Schema object. Includes project schema configuration and
methods to update configured Solr instance.'''

#: solr schema field type definitions; each entry is a field type
#: configuration dictionary identified by its ``name``
field_types = [
{
# text field type with spelling variant normalization
'name': 'text_en',
"class":"solr.TextField",
# for now, configuring index and query analyzers the same
# if we want synonyms, query must be separate
"analyzer" : {
# "charFilters": [
# ],
"tokenizer": {
"class": "solr.StandardTokenizerFactory"
},
"filters": [
# NOTE(review): the stop filter is listed before the lowercase
# filter; confirm capitalized stop words are handled as intended
{"class":"solr.StopFilterFactory"},
{"class": "solr.LowerCaseFilterFactory"},
{"class": "solr.EnglishPossessiveFilterFactory"},
{"class": "solr.KeywordMarkerFilterFactory"},
{"class": "solr.PorterStemFilterFactory"},
{"class": "solr.ICUFoldingFilterFactory"},

# preliminary spelling variant handling: pattern replacements
# listed after the stemming and folding filters above

# strip e at end of word
# (NOTE: does stemming already handle this?)
{"class":"solr.PatternReplaceFilterFactory",
"pattern": r'(\w+)e$', "replacement":"$1"},
# convert vv to w
# (listed before the v -> u rule below; presumably so that vv
# is matched before single Vs are rewritten -- TODO confirm)
{"class": "solr.PatternReplaceFilterFactory",
"pattern": r'vv', "replacement":"w"},
# treat all Js as Is
{"class":"solr.PatternReplaceFilterFactory",
"pattern": r'j', "replacement":"i"},
# treat all Vs as Us
{"class":"solr.PatternReplaceFilterFactory",
"pattern": r'v', "replacement":"u"},
]
}
},
{
# field type named 'string', defined here with the solr.TextField
# class; only an index analyzer is configured (no query analyzer)
'name': 'string',
"class":"solr.TextField",
"sortMissingLast": True,
"indexAnalyzer": {
"tokenizer": {
# treat entire field as a single token
"class": "solr.KeywordTokenizerFactory",
},
"filters": [
{"class": "solr.ICUFoldingFilterFactory"},
]
}
}
]

#: solr schema field definitions
fields = [
{'name': 'title', 'type': 'text_en', 'required': False},
Expand Down Expand Up @@ -58,7 +110,8 @@ class SolrSchema(object):
{'name': 'original_pub_info', 'type': 'text_en', 'required': False},
{'name': 'notes', 'type': 'text_en', 'required': False},

{'name': 'text', 'type': 'text_en', 'required': False, 'stored': False,
# stored = true while testing spelling variation support
{'name': 'text', 'type': 'text_en', 'required': False, 'stored': True,
'multiValued': True},

# have solr automatically track last modification time for
Expand Down Expand Up @@ -106,10 +159,29 @@ def solr_schema_fields(self):
schema_info = self.solr.schema.get_schema_fields(self.solr_collection)
return [field['name'] for field in schema_info['fields']]

def solr_schema_field_types(self):
'''Dictionary of currently configured Solr schema field types,
keyed on field type name. Each value is the field type definition
from the ``fieldTypes`` list in the schema response.'''
response = self.solr.schema.get_schema_field_types(self.solr_collection)
# index the field type definitions by name for lookup
return {field_type['name']: field_type for field_type in response['fieldTypes']}

def update_solr_schema(self):
'''Update the configured solr instance schema to match
the configured fields. Returns a tuple with the number of fields
created and updated.'''

current_field_types = self.solr_schema_field_types()

for field_type in self.field_types:
if field_type['name'] in current_field_types:
# if field exists but definition has changed, replace it
if field_type != current_field_types[field_type['name']]:
self.solr.schema.replace_field_type(self.solr_collection, field_type)
# otherwise, create as a new field
else:
self.solr.schema.create_field_type(self.solr_collection, field_type)

# TODO: deletion?

current_fields = self.solr_schema_fields()

created = updated = removed = 0
Expand Down

0 comments on commit f4fcda5

Please sign in to comment.