-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathconfig.yaml
60 lines (54 loc) · 3.62 KB
/
config.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# Configuration for the sbbget download script.
# NOTE(review): the settings are nested under `sbbget` here; the nesting was
# reconstructed from a flattened copy - confirm against the consuming script.
sbbget:
  # Path to the PPN list file that determines what will be downloaded.
  # For testing purposes we try to download one historical illustrated
  # children's playbook with OCR. This will download and create approx.
  # 1.98 GB of test data.
  ppnListFile: "../ppn_lists/demo_document.csv"

  # In case the PPN list contains PPNs without the "PPN" prefix, it will be
  # added. If a PPN already starts with "PPN", the prefix will not be added.
  addPPNPrefix: true

  # STANDARD file download settings (will download images and fulltexts):
  #   retrievalScope: ['TIFF', 'FULLTEXT']
  # Please note that the retrieval scope overrides all of the following
  # settings. If set to 'FULLTEXT', no images will be downloaded even if
  # forceTitlePageDownload etc. is set.
  retrievalScope: ['TIFF', 'FULLTEXT']
  # TODO: add switch for the following types,
  #       default: FULLTEXT and PRESENTATION
  #   <mets:fileGrp USE="THUMBS">
  #   <mets:fileGrp USE="DEFAULT">
  #   <mets:fileGrp USE="FULLTEXT">
  #   <mets:fileGrp USE="PRESENTATION">

  # Should illustrations found by the OCR be extracted?
  extractIllustrations: true

  # Determines the file format for extracted images. If you want to keep
  # max. quality, use ".tif" instead.
  illustrationExportFileType: ".jpg"

  # (Recommended setting) Create .tar files from the extracted illustrations
  # and delete the extracted illustrations afterwards. This facilitates
  # distribution because far fewer files will be created. However, it will
  # slow down processing because of the packing overhead.
  createTarBallOfExtractedIllustrations: true

  # Store title page thumbnails separately? (They will be saved in the
  # illustrationExportFileType format.) Works only if skipDownloads is false
  # or forceTitlePageDownload is true.
  storeExtraTitlePageThumbnails: true

  # The maximum dimensions of the thumbnail as [<width>, <height>]
  # (the aspect ratio remains intact).
  titlePageThumbnailSize: [512, 512]

  # Delete temporary files (will remove XML documents and OCR fulltexts and
  # leave you alone with the extracted images).
  deleteTempFolders: false

  # If true, downloaded full page TIFFs will be removed after the
  # illustrations have been extracted (saves a lot of storage space).
  deleteMasterTIFFs: false

  # Handy if a certain file set has been downloaded before and processing
  # has to be limited to post-processing only.
  skipDownloads: false

  # Overrides skipDownloads to force the download of title pages
  # (first pages will not be downloaded!).
  forceTitlePageDownload: true

  # Enables verbose output during processing.
  verbose: true

  # Determines which ALTO elements (coming from the OCR) should be extracted.
  # Optional additional element (currently disabled):
  #   '{http://www.loc.gov/standards/alto/ns-v2#}GraphicalElement'
  consideredAltoElements: ['{http://www.loc.gov/standards/alto/ns-v2#}Illustration']

  # Path to the log file which also stores information if the script run has
  # been canceled and should be resumed (in case of a large amount of
  # downloads). If you want to force new downloads, just delete this file.
  logFileName: 'ppn_log.log'

  # Error log file name.
  errorLogFileName: "sbbget_error.log"

  # --- Parameters no-one outside the Berlin State Library will ever use ---

  # Setting this variable to true will disable SSL certificate verification -
  # USE AT YOUR OWN RISK!
  allowUnsafeSSLConnections_NEVER_USE_IN_PRODUCTION: false

  # Berlin State Library internal setting.
  runningFromWithinStabi: false

  # Stabi internal setup variants, may vary depending on the sub-net of the
  # machine:
  #   dev Windows:
  #     allowUnsafeSSLConnections_NEVER_USE_IN_PRODUCTION: true
  #     runningFromWithinStabi: true
  #   dev Linux:
  #     allowUnsafeSSLConnections_NEVER_USE_IN_PRODUCTION: false
  #     runningFromWithinStabi: true