Skip to content

Solr Schema Configuration

willprice76 edited this page Dec 19, 2014 · 9 revisions

SI4T's schema.xml must have the following minimal structure:

<?xml version="1.0" ?>
<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements.  See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License.  You may obtain a copy of the License at

 http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->
<!-- 
	SI4T minimum schema example.
-->
<schema name="staging" version="1.1">
	<!-- Minimum field type definition. -->
	<types>
		<!--
			Type for fields that must be dropped: neither indexed nor stored.
			multiValued="true" is required here because every unknown field is mapped
			to this type (see the dynamicField rules); without it a document that
			repeats an unmapped field is rejected instead of having the field ignored.
		-->
		<fieldType name="ignored" class="solr.StrField" indexed="false" stored="false" multiValued="true"/>
		<!-- Verbatim (untokenized) string values. -->
		<fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
		<!-- Trie-encoded integer and date types. -->
		<fieldType name="tint" class="solr.TrieIntField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
		<fieldType name="tdate" class="solr.TrieDateField" omitNorms="true" precisionStep="6" positionIncrementGap="0"/>
		<!-- Tokenized free text: stop words are removed and tokens are lower-cased. -->
		<fieldType name="text_general" class="solr.TextField" positionIncrementGap="100">
			<!-- Index time: tokenize, drop stop words listed in stopwords.txt, lower-case. -->
			<analyzer type="index">
				<tokenizer class="solr.StandardTokenizerFactory"/>
				<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
				<filter class="solr.LowerCaseFilterFactory"/>
			</analyzer>
			<!-- Query time: same chain as index time, plus synonym expansion from synonyms.txt. -->
			<analyzer type="query">
				<tokenizer class="solr.StandardTokenizerFactory"/>
				<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
				<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
				<filter class="solr.LowerCaseFilterFactory"/>
			</analyzer>
		</fieldType>
		<!-- Long type; used by the _version_ field. -->
		<fieldType name="long" class="solr.TrieLongField" precisionStep="0" positionIncrementGap="0"/>
	</types>
	<fields>
		<!-- SI4T mandatory fields -->
		<field name="id" type="string" indexed="true" stored="true" multiValued="false" required="true"/>
		<field name="url" type="string" indexed="true" stored="true" multiValued="false"/>
		<field name="pubdate" type="tdate" indexed="true" stored="true" multiValued="false"/>
		<!-- End of SI4T mandatory fields -->

		<!-- Fields configured in the standard SI4T TBB -->
		<field name="title" type="string" indexed="true" stored="true" multiValued="false"/>
		<field name="publicationid" type="tint" indexed="true" stored="true" multiValued="false"/>
		<field name="schemaid" type="tint" indexed="true" stored="true" multiValued="false"/>
		<field name="itemtype" type="tint" indexed="true" stored="true" multiValued="false"/>
		<field name="parentsgid" type="tint" indexed="true" stored="true" multiValued="false"/>
		<field name="sgid" type="tint" indexed="true" stored="true" multiValued="true"/>
		<field name="type" type="tint" indexed="true" stored="true" multiValued="false"/>
		<field name="body" type="text_general" indexed="true" stored="true" multiValued="true"/>
		<field name="_version_" type="long" indexed="true" stored="true"/>

		<!--
			Declare any additional custom fields here.
			For example, if a custom field 'customerid' is added through the SI4T TBB like so:
				<custom><customerid>200</customerid></custom>
			then add a matching field to this schema:
				<field name="customerid" type="tint" indexed="true" stored="true" multiValued="false" />
		-->

		<!-- Catch-all field, searched when a query does not name a field. -->
		<field name="searchall" type="text_general" indexed="true" stored="true" multiValued="true"/>

		<!--
			Binary extraction example. It has to be configured in two places:
				1. solrconfig.xml:
					Map the extracted binary content to a binary field and, for instance,
					map the title extracted from the binary to a BinaryEmbeddedTitle field.
					An example is:
						<requestHandler name="/update/extract" class="org.apache.solr.handler.extraction.ExtractingRequestHandler">
						<lst name="defaults">
						<str name="uprefix">binary_</str>
						<str name="fmap.content">binary_content</str>
						<str name="fmap.title">BinaryEmbeddedTitle</str>
						</lst>
						</requestHandler>
				2. schema.xml:
					Declare a field for each target field mapped in step 1.
		-->
		<!-- Binary extraction storage fields -->
		<field name="BinaryEmbeddedTitle" type="string" indexed="true" stored="true" multiValued="true"/>
		<field name="binary_content" type="text_general" indexed="true" stored="true" multiValued="false"/>
		<field name="fileType" type="string" indexed="true" stored="true" multiValued="false"/>
		<field name="fileSize" type="string" indexed="true" stored="true" multiValued="false"/>

		<!--
			Every field not declared above is mapped to the 'ignored' type.
			This includes all binary_* fields other than binary_content.
		-->
		<dynamicField name="*" type="ignored"/>
		<dynamicField name="binary_*" type="ignored"/>
	</fields>
	<!-- Copy every field into 'searchall' so that the default search covers all content. -->
	<copyField source="*" dest="searchall"/>
	<!-- Field that identifies a document and is used to enforce uniqueness. -->
	<uniqueKey>id</uniqueKey>
	<!-- Field the query parser searches when a query gives no explicit field name. -->
	<defaultSearchField>searchall</defaultSearchField>
	<!-- Query parser configuration; defaultOperator accepts "AND" or "OR". -->
	<solrQueryParser defaultOperator="OR"/>
</schema>
Clone this wiki locally