-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathconfig-clarin-clarin.xml
158 lines (142 loc) · 7.32 KB
/
config-clarin-clarin.xml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
<!-- This is the configuration for CLARIN harvesting. -->
<config>
<!-- ### configuration settings ### -->
<settings>
  <!-- Directory the harvester uses as its working directory. -->
  <workdir>workspace</workdir>
  <!-- Upper bound on attempts for a single record; after that it is given up on. -->
  <max-retry-count>2</max-retry-count>
  <!-- Pause, in seconds, between successive attempts on a record. -->
  <retry-delay>10</retry-delay>
  <!-- Upper bound on the number of harvester threads running concurrently. -->
  <max-jobs>6</max-jobs>
  <!-- How many resources are put into the resource pool. -->
  <resource-pool-size>4</resource-pool-size>
  <!-- Default timeout, in seconds, for one HTTP request; it covers both
       connecting and reading. When left out, the timeout is INFINITE. -->
  <timeout>60</timeout>
  <!-- File in which harvesting times are logged. -->
  <state-file>state.xml</state-file>
  <!-- File holding the endpoint -> directory mappings. -->
  <map-file>map.csv</map-file>
  <!-- Switch for incremental harvesting. It only takes effect when all of the
       following hold: state-file is defined, that file exists, and the
       provider concerned has been harvested before. Otherwise this setting
       is ignored. -->
  <incremental>false</incremental>
  <!-- Harvesting scenario. Presumably the default for all endpoints: several
       per-endpoint config entries in the providers section set their own
       'scenario' attribute (ListIdentifiers) instead. -->
  <scenario>ListRecords</scenario>
</settings>
<!-- ### output directories (referenced in the action section) ### -->
<directories>
  <!-- A non-zero 'max-files' attribute causes subdirectories to be created,
       so that no single directory ends up holding more than that many
       files. The 'id' values are what the action section refers to. -->
  <!-- Disabled: <dir path="oai-rec" id="rec" max-files="0"/> -->
  <dir path="oai-pmh" id="oai" max-files="0"/>
  <dir path="results/cmdi-1_1" id="cmdi-1_1" max-files="0"/>
  <dir path="results/cmdi" id="cmdi-1_2" max-files="0"/>
</directories>
<!-- ### actions to take on metadata formats (in order of preference) ### -->
<actions>
  <!-- CMDI 1.2: saved to 'oai', then split, stripped and saved to 'cmdi-1_2'. -->
  <format match="namespace" value="http://www.clarin.eu/cmd/1">
    <action type="save" dir="oai" suffix=".xml"/>
    <action type="split"/>
    <!-- Disabled: <action type="save" dir="rec" suffix=".xml"/> -->
    <action type="strip"/>
    <action type="save" dir="cmdi-1_2" suffix=".xml"/>
  </format>

  <!-- CMDI 1.1: saved to 'oai', split, stripped and saved to 'cmdi-1_1';
       afterwards run through the 1.1-to-1.2 upgrade stylesheet and saved
       to 'cmdi-1_2'. -->
  <format match="namespace" value="http://www.clarin.eu/cmd/">
    <action type="save" dir="oai" suffix=".xml"/>
    <action type="split"/>
    <!-- Disabled: <action type="save" dir="rec" suffix=".xml"/> -->
    <action type="strip"/>
    <action type="save" dir="cmdi-1_1" suffix=".xml"/>
    <action type="transform"
            file="https://infra.clarin.eu/CMDI/1.x/upgrade/cmd-record-1_1-to-1_2.xsl"
            cache="cache"/>
    <action type="save" dir="cmdi-1_2" suffix=".xml"/>
  </format>

  <!-- OLAC: saved to 'oai', run through addOAISetName.xsl, split, converted
       with olac2cmdi.xsl and saved to 'cmdi-1_1'; afterwards run through the
       1.1-to-1.2 upgrade stylesheet and saved to 'cmdi-1_2'. -->
  <format match="prefix" value="olac">
    <action type="save" dir="oai" suffix=".xml"/>
    <action type="transform" file="resources/addOAISetName.xsl" cache="cache"/>
    <action type="split"/>
    <!-- Disabled: <action type="save" dir="rec" suffix=".xml"/> -->
    <action type="transform"
            file="https://raw.githubusercontent.com/clarin-eric/metadata-conversion/olac-cmdi/olac-cmdi/olac2cmdi.xsl"/>
    <action type="save" dir="cmdi-1_1" suffix=".xml"/>
    <action type="transform"
            file="https://infra.clarin.eu/CMDI/1.x/upgrade/cmd-record-1_1-to-1_2.xsl"
            cache="cache"/>
    <action type="save" dir="cmdi-1_2" suffix=".xml"/>
  </format>

  <!-- DataCite, prefix 'oai_datacite': saved to 'oai', run through
       addOAISetName.xsl, split, stripped, converted with datacite_to_cmdi.xsl
       and saved to 'cmdi-1_2'. -->
  <format match="prefix" value="oai_datacite">
    <action type="save" dir="oai" suffix=".xml"/>
    <action type="transform" file="resources/addOAISetName.xsl" cache="cache"/>
    <action type="split"/>
    <action type="strip"/>
    <!-- Disabled: <action type="save" dir="rec" suffix=".xml"/> -->
    <action type="transform"
            file="https://raw.githubusercontent.com/clarin-eric/metadata-conversion/master/datacite-cmdi/datacite_to_cmdi.xsl"/>
    <action type="save" dir="cmdi-1_2" suffix=".xml"/>
  </format>

  <!-- DataCite, prefix 'datacite': same sequence of actions as for the
       'oai_datacite' prefix. -->
  <format match="prefix" value="datacite">
    <action type="save" dir="oai" suffix=".xml"/>
    <action type="transform" file="resources/addOAISetName.xsl" cache="cache"/>
    <action type="split"/>
    <action type="strip"/>
    <!-- Disabled: <action type="save" dir="rec" suffix=".xml"/> -->
    <action type="transform"
            file="https://raw.githubusercontent.com/clarin-eric/metadata-conversion/master/datacite-cmdi/datacite_to_cmdi.xsl"/>
    <action type="save" dir="cmdi-1_2" suffix=".xml"/>
  </format>

  <!-- Dublin Core: same sequence of actions as for OLAC, including the
       olac2cmdi.xsl conversion stylesheet. -->
  <format match="prefix" value="oai_dc">
    <action type="save" dir="oai" suffix=".xml"/>
    <action type="transform" file="resources/addOAISetName.xsl" cache="cache"/>
    <action type="split"/>
    <!-- Disabled: <action type="save" dir="rec" suffix=".xml"/> -->
    <action type="transform"
            file="https://raw.githubusercontent.com/clarin-eric/metadata-conversion/olac-cmdi/olac-cmdi/olac2cmdi.xsl"/>
    <action type="save" dir="cmdi-1_1" suffix=".xml"/>
    <action type="transform"
            file="https://infra.clarin.eu/CMDI/1.x/upgrade/cmd-record-1_1-to-1_2.xsl"
            cache="cache"/>
    <action type="save" dir="cmdi-1_2" suffix=".xml"/>
  </format>
</actions>
<!-- ### list of providers ### -->
<providers>
  <!-- Providers imported from the centre registry. -->
  <import>
    <registry url="https://centres.clarin.eu/api/model"/>
    <includeOaiPmhSetTypes>*</includeOaiPmhSetTypes>
    <excludeOaiPmhSetTypes>WebLicht</excludeOaiPmhSetTypes>

    <!--
      To block harvesting from a particular provider defined in the registry,
      add a child element tagged 'exclude' carrying that provider's endpoint
      URL. Several exclude elements may be given. Note that an exclusion
      applies only to the registry element, not to the provider elements.
    -->
    <!--
      <exclude url=""/>
    -->

    <!--
      Endpoint-specific configuration for providers defined in the registry.
    -->
    <!-- MPI sometimes needs a bit more time. -->
    <config url="https://archive.mpi.nl/oai2"
            max-retry-count="5"
            retry-delay="10 30 60 600"/>
    <!-- ACDH/ARCHE needs some time to generate the metadata. -->
    <config url="https://arche.acdh.oeaw.ac.at/oaipmh/"
            timeout="300"
            scenario="ListIdentifiers"/>
    <!-- Leipzig does not tolerate long gaps between calls within one session;
         GetRecord calls are not affected by this. -->
    <config url="https://clarinoai.informatik.uni-leipzig.de/oaiprovider/oai"
            scenario="ListIdentifiers"/>
    <!-- BAS sometimes runs into memory problems when its big CMD records are
         processed in batch. -->
    <config url="http://www.phonetik.uni-muenchen.de/cgi-bin/BASRepository/oaipmh/oai.pl"
            scenario="ListIdentifiers"
            timeout="300"/>
    <!-- CELR (Tartu) throws an error on 'verb=ListRecords&metadataPrefix=cmdi',
         but GetRecord calls seem to work. -->
    <config url="https://metashare.ut.ee/oai_pmh/" scenario="ListIdentifiers"/>
  </import>

  <!-- Virtual Collection Registry -->
  <provider url="https://collections.clarin.eu/oai"/>

  <!--
    FIN-CLARIN is harvested from the same endpoint as CLARINO, but it is
    defined separately here so that it is not listed under the same national
    project. The double slash in the URL is not a typo: it is there to meet
    the requirement that every endpoint URL be unique.
    See <https://github.com/clarin-eric/metadata-curation/issues/12>.
  -->
  <provider url="https://clarino.uib.no//oai" name="FIN-CLARIN">
    <set>FIN-CLARIN</set>
  </provider>
</providers>
</config>