forked from NationalSecurityAgency/datawave
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwikipedia-ingest-config.xml
145 lines (123 loc) · 4.5 KB
/
wikipedia-ingest-config.xml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<!-- Data type identity and class wiring. Every class configured in this group
     lives in the datawave.ingest.wikipedia package. -->
<!-- Input format class for this data type.
     NOTE(review): presumably a Hadoop InputFormat implementation; confirm. -->
<property>
<name>file.input.format</name>
<value>datawave.ingest.wikipedia.WikipediaInputFormat</value>
</property>
<!-- Data type name. The value "wikipedia" is also the prefix of every
     wikipedia.* key that follows in this file. -->
<property>
<name>data.name</name>
<value>wikipedia</value>
<description>This is the type of data being ingested.
</description>
</property>
<!-- Ingest helper class for the wikipedia data type. -->
<property>
<name>wikipedia.ingest.helper.class</name>
<value>datawave.ingest.wikipedia.WikipediaIngestHelper</value>
</property>
<!-- Record reader class for the wikipedia data type. -->
<property>
<name>wikipedia.reader.class</name>
<value>datawave.ingest.wikipedia.WikipediaRecordReader</value>
</property>
<!-- Handler class(es) for the wikipedia data type. Only one handler is configured here.
     NOTE(review): the plural key name suggests a list is accepted; confirm the delimiter before adding more. -->
<property>
<name>wikipedia.handler.classes</name>
<value>datawave.ingest.wikipedia.WikipediaDataTypeHandler</value>
</property>
<!-- Currently disabled.
     NOTE(review): presumably controls whether malformed UTF-8 in the input is replaced; confirm against the reader. -->
<property>
<name>wikipedia.data.replace.malformed.utf8</name>
<value>false</value>
</property>
<!-- ********************************************
Ingest Helper Rules
******************************************** -->
<!-- Designator "_TOKEN": the same suffix carried by the REVISION_TEXT_TOKEN and
     REVISION_COMMENT_TOKEN field names used in the index properties of this file. -->
<property>
<name>wikipedia.data.category.token.fieldname.designator</name>
<value>_TOKEN</value>
</property>
<!-- Disabled here. Per the description, enabling it also requires the BT_ROOT
     environment variable to be set. -->
<property>
<name>wikipedia.ingest.basis.enabled</name>
<value>false</value>
<description>Enable the use of Basis Technologies Rosette Linguistics Processor.
BT_ROOT must be set as an environment variable. BT_ROOT defines
the location of Rosette Language Platform.
</description>
</property>
<!-- Enabled here.
     NOTE(review): presumably turns on synonym generation for tokens; confirm in the ingest helper. -->
<property>
<name>wikipedia.token.synonyms.create</name>
<value>true</value>
</property>
<!-- ********************************************
Indexing
******************************************** -->
<!-- Comma-separated list of the 15 fields to index. -->
<property>
<name>wikipedia.data.category.index</name>
<value>CONTRIBUTOR_USERNAME,CONTRIBUTOR_ID,PAGE_ID,PAGE_NAMESPACE,PAGE_RESTRICTIONS,PAGE_TITLE,REVISION_COMMENT_TERM_COUNT,REVISION_COMMENT_TOKEN,REVISION_ID,REVISION_MINOR,REVISION_PARENTID,REVISION_SHA1,REVISION_TEXT_TERM_COUNT,REVISION_TEXT_TOKEN,REVISION_TIMESTAMP</value>
<description>These are the fields to index (delimited by first separator).
</description>
</property>
<!-- Fields for the reverse index. Every field listed here also appears in
     wikipedia.data.category.index. -->
<property>
<name>wikipedia.data.category.index.reverse</name>
<value>CONTRIBUTOR_USERNAME,PAGE_TITLE,PAGE_RESTRICTIONS,REVISION_TEXT_TOKEN,REVISION_COMMENT_TOKEN</value>
</property>
<!-- The two token fields.
     NOTE(review): "index.only" presumably means indexed but not stored as event fields; confirm. -->
<property>
<name>wikipedia.data.category.index.only</name>
<value>REVISION_TEXT_TOKEN,REVISION_COMMENT_TOKEN</value>
</property>
<!-- Content index fields: the same two token fields as wikipedia.data.category.index.only. -->
<property>
<name>wikipedia.content.index.fields</name>
<value>REVISION_TEXT_TOKEN,REVISION_COMMENT_TOKEN</value>
</property>
<!-- ********************************************
Virtual Fields
******************************************** -->
<!-- Start separator for combined fields; the value is the single character "<".
     It must be written as the entity &lt; because a raw "<" in element text makes
     the document ill-formed and the whole configuration file fails to parse. -->
<property>
<name>wikipedia.data.combine.start.separator</name>
<value>&lt;</value>
</property>
<!-- End separator for combined fields; the value is the single character ">".
     Written as the entity &gt;, which parses to exactly the same character as a raw ">". -->
<property>
  <name>wikipedia.data.combine.end.separator</name>
  <value>&gt;</value>
</property>
<!-- ********************************************
Normalization
******************************************** -->
<!-- Default type class for fields that have no per-field override below. -->
<property>
<name>wikipedia.data.default.type.class</name>
<value>datawave.data.type.LcNoDiacriticsType</value>
</property>
<!-- Per-field overrides use the key pattern wikipedia.FIELD.data.field.type.class.
     All overrides in this file select datawave.data.type.NumberType. -->
<property>
<name>wikipedia.PAGE_NAMESPACE.data.field.type.class</name>
<value>datawave.data.type.NumberType</value>
</property>
<property>
<name>wikipedia.PAGE_ID.data.field.type.class</name>
<value>datawave.data.type.NumberType</value>
</property>
<property>
<name>wikipedia.REVISION_ID.data.field.type.class</name>
<value>datawave.data.type.NumberType</value>
</property>
<property>
<name>wikipedia.REVISION_PARENTID.data.field.type.class</name>
<value>datawave.data.type.NumberType</value>
</property>
<property>
<name>wikipedia.CONTRIBUTOR_ID.data.field.type.class</name>
<value>datawave.data.type.NumberType</value>
</property>
<!-- NOTE(review): the "*" in this key is presumably a wildcard covering
     REVISION_COMMENT_TERM_COUNT and REVISION_TEXT_TERM_COUNT; confirm that the
     ingest helper expands field-name patterns. -->
<property>
<name>wikipedia.*_TERM_COUNT.data.field.type.class</name>
<value>datawave.data.type.NumberType</value>
</property>
<!-- The WikipediaRecordReader automatically sets the ColumnVisibility to PUBLIC unless the property below is set. -->
<!-- Uncommenting the property below is therefore the only way to set the ColumnVisibility to something other than PUBLIC. -->
<!-- <property> -->
<!-- <name>wikipedia.data.category.marking.default</name> -->
<!-- <value>PROTECTED</value> -->
<!-- <description>Default ColumnVisibility to be applied to fields/records if none provided in the data</description> -->
<!-- </property> -->
<!-- Default policy applied when normalizing a field value fails; set to LEAVE.
     NOTE(review): LEAVE presumably keeps the value un-normalized instead of dropping
     the field or failing the record; confirm the accepted policy names. -->
<property>
<name>wikipedia.data.default.normalization.failure.policy</name>
<value>LEAVE</value>
</property>
</configuration>